Untrained model is done.
This commit is contained in:
@@ -1,11 +1,11 @@
|
||||
from pathlib import Path
|
||||
|
||||
import tiktoken
|
||||
import torch
|
||||
from llmfs.attn import MultiHeadAttention
|
||||
from llmfs.datasets.v1 import GPTDataSetV1
|
||||
from llmfs.gpt import GPTConfig
|
||||
from llmfs.gpt import DummyGPT, GPTConfig, TransformerBlock
|
||||
from llmfs.tokenizers import BPETokenizer
|
||||
|
||||
|
||||
DATA_DIR = Path(__file__).parent.parent / "data"
|
||||
|
||||
|
||||
@@ -20,23 +20,61 @@ GPT_CONFIG_124M = GPTConfig(
|
||||
)
|
||||
|
||||
|
||||
def generate_text_simple(
    model: torch.nn.Module, idx: torch.Tensor, max_new_tokens: int, context_size: int
) -> torch.Tensor:
    """Greedily extend ``idx`` by ``max_new_tokens`` token ids.

    On every step the model is fed at most the last ``context_size`` columns
    of ``idx``; the logits at the final position are taken and the id with
    the highest score is appended.

    Args:
        model: Callable mapping a 2-D tensor of token ids to logits. Its
            output is indexed as ``[:, -1, :]``, so it must be 3-D.
        idx: 2-D tensor of token ids; new ids are concatenated along dim 1.
        max_new_tokens: Number of ids to append. ``0`` returns ``idx`` as is.
        context_size: Maximum number of trailing ids passed to ``model``.

    Returns:
        ``idx`` with ``max_new_tokens`` extra columns.
    """
    # Gradients are never needed for generation, so disable them once for
    # the whole loop instead of re-entering the context on every step.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Crop the running sequence to the supported context window.
            idx_cond = idx[:, -context_size:]
            # Keep only the prediction for the last position.
            logits: torch.Tensor = model(idx_cond)[:, -1, :]
            # Softmax is monotonic, so the argmax of the raw logits selects
            # the same token without the extra normalisation pass.
            idx_next = logits.argmax(dim=-1, keepdim=True)
            idx = torch.cat((idx, idx_next), dim=1)
    return idx
|
||||
|
||||
|
||||
def process_text(text: str):
|
||||
tokenizer = BPETokenizer.build(text)
|
||||
print("Buiding tokenizer")
|
||||
# tokenizer = BPETokenizer.build(text)
|
||||
tokenizer = tiktoken.encoding_for_model("gpt2")
|
||||
vocab_size = tokenizer.max_token_value + 1
|
||||
max_len = 4
|
||||
ctx_len = max_len
|
||||
output_dim = 256
|
||||
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
|
||||
pos_embedding_layer = torch.nn.Embedding(ctx_len, output_dim)
|
||||
pos_embeddings = pos_embedding_layer(torch.arange(ctx_len))
|
||||
dataset = GPTDataSetV1.data_loader(
|
||||
text,
|
||||
tokenizer,
|
||||
batch_size=8,
|
||||
max_len=4,
|
||||
stride=1,
|
||||
shuffle=False,
|
||||
print(f"Tokenizer is ready. Vocab size: {vocab_size}")
|
||||
batch = torch.stack(
|
||||
[
|
||||
torch.tensor(tokenizer.encode("Every effort moves you")),
|
||||
torch.tensor(tokenizer.encode("Every day holds a")),
|
||||
],
|
||||
dim=0,
|
||||
)
|
||||
cfg = GPTConfig(
|
||||
vocab_size=vocab_size,
|
||||
context_length=1024,
|
||||
embedding_dim=768,
|
||||
n_heads=12,
|
||||
n_layers=12,
|
||||
dropout=0.1,
|
||||
qkv_bias=False,
|
||||
)
|
||||
gpt = DummyGPT(cfg)
|
||||
gpt.eval()
|
||||
start_ctx = "Hello, I am"
|
||||
encoded = tokenizer.encode(start_ctx)
|
||||
encoded_tensor = torch.tensor(encoded).unsqueeze(0)
|
||||
out = generate_text_simple(gpt, encoded_tensor, 6, cfg.context_length)
|
||||
decoded_text = tokenizer.decode(out.squeeze(0).tolist())
|
||||
print(decoded_text)
|
||||
# logits = gpt(batch)
|
||||
# print(logits)
|
||||
# print(logits.shape)
|
||||
# dataset = GPTDataSetV1.data_loader(
|
||||
# text,
|
||||
# tokenizer,
|
||||
# batch_size=8,
|
||||
# max_len=4,
|
||||
# stride=1,
|
||||
# shuffle=False,
|
||||
# )
|
||||
# for inps, targs in iter(dataset):
|
||||
# embeds = token_embedding_layer(inps)
|
||||
# print(embeds.shape)
|
||||
@@ -46,32 +84,9 @@ def process_text(text: str):
|
||||
# tokenizer = BPETokenizer.build(text)
|
||||
|
||||
|
||||
def attn_test():
    """Run a two-sample batch through ``MultiHeadAttention`` and print the result.

    The batch is the same 6x3 matrix stacked twice along a new leading
    dimension.
    """
    rows = [
        [0.43, 0.15, 0.89],
        [0.55, 0.87, 0.66],
        [0.57, 0.85, 0.64],
        [0.22, 0.58, 0.43],
        [0.77, 0.25, 0.10],
        [0.05, 0.80, 0.55],
    ]
    inps = torch.Tensor(rows)
    n_rows, n_cols = inps.shape
    batch = torch.stack((inps, inps), dim=0)
    # NOTE(review): ``dropout`` receives a bool here; if the parameter is a
    # drop probability, ``True`` would mean 1.0 - confirm this is intended.
    attn = MultiHeadAttention(n_cols, 8, n_rows, dropout=True, num_heads=2)
    print(attn(batch))
|
||||
|
||||
|
||||
def main() -> None:
    """Read ``the-verdict.txt`` from ``DATA_DIR`` and run the demo routines."""
    raw_text = (DATA_DIR / "the-verdict.txt").read_text()
    # process_text(raw_text)
    attn_test()
    process_text(raw_text)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
108
llmfs/gpt.py
108
llmfs/gpt.py
@@ -2,6 +2,8 @@ from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
|
||||
from llmfs.attn import MultiHeadAttention
|
||||
|
||||
|
||||
@dataclass
|
||||
class GPTConfig:
|
||||
@@ -14,8 +16,91 @@ class GPTConfig:
|
||||
qkv_bias: bool
|
||||
|
||||
|
||||
class DummyGPT:
|
||||
class DummyTransformerBlock(torch.nn.Module):
    """Placeholder block whose forward pass returns its input unchanged."""

    def __init__(self, config: GPTConfig):
        # ``config`` is accepted but not read by this stub.
        super().__init__()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``x`` as is."""
        return x
|
||||
|
||||
|
||||
class GELU(torch.nn.Module):
    """Gaussian Error Linear Unit, tanh approximation.

    Computes ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``
    element-wise.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the activation element-wise and return a tensor shaped like ``x``."""
        # Plain Python float: avoids allocating a tensor on every call just
        # to take a square root.
        sqrt_two_over_pi = (2.0 / torch.pi) ** 0.5
        # The cubic coefficient of the tanh approximation is 0.044715.
        inner = sqrt_two_over_pi * (x + 0.044715 * torch.pow(x, 3))
        return 0.5 * x * (1 + torch.tanh(inner))
|
||||
|
||||
|
||||
class FeedForward(torch.nn.Module):
    """Two linear layers with a ``GELU`` between them.

    The first layer widens the last dimension from ``cfg.embedding_dim`` to
    four times that size; the second maps it back.
    """

    def __init__(self, cfg: GPTConfig) -> None:
        super().__init__()
        width = cfg.embedding_dim
        hidden = 4 * width
        # Layers are created in forward order so parameter initialisation
        # consumes the RNG in the same sequence as they are applied.
        expand = torch.nn.Linear(width, hidden)
        activation = GELU()
        project = torch.nn.Linear(hidden, width)
        self.layers = torch.nn.Sequential(expand, activation, project)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run ``x`` through the expand / activate / project stack."""
        return self.layers(x)
|
||||
|
||||
|
||||
class TransformerBlock(torch.nn.Module):
    """Attention and feed-forward sub-layers, each with a residual shortcut.

    Each sub-layer is applied as ``x = dropout(sublayer(norm(x))) + x``; the
    same ``Dropout`` module serves both sub-layers.
    """

    def __init__(self, cfg: GPTConfig) -> None:
        super().__init__()
        dim = cfg.embedding_dim
        # NOTE(review): ``cfg.n_heads`` is not forwarded to the attention
        # module; verify the positional arguments below against the
        # ``MultiHeadAttention`` signature.
        self.att = MultiHeadAttention(
            dim, dim, cfg.context_length, cfg.dropout, cfg.qkv_bias
        )
        self.ff = FeedForward(cfg)
        self.norm1 = NormLayer(dim)
        self.norm2 = NormLayer(dim)
        self.dropout = torch.nn.Dropout(cfg.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the attention sub-layer, then the feed-forward sub-layer."""
        # Normalisation happens before each sub-layer; the un-normalised
        # input is added back afterwards.
        x = self.dropout(self.att(self.norm1(x))) + x
        return self.dropout(self.ff(self.norm2(x))) + x
|
||||
|
||||
|
||||
class NormLayer(torch.nn.Module):
    """Layer normalisation over the last dimension with learnable scale and shift.

    Args:
        dim: Size of the last dimension of the inputs.
        eps: Constant added to the variance before the square root so the
            division stays finite.
    """

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.dim = dim
        self.eps = eps
        self.scale = torch.nn.Parameter(torch.ones(dim))
        self.shift = torch.nn.Parameter(torch.zeros(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Normalise ``x`` along its last dimension, then scale and shift it."""
        mean = x.mean(-1, keepdim=True)
        # Biased (population) variance, as in standard layer normalisation.
        # The unbiased estimator divides by ``n - 1`` and therefore yields
        # NaN when the normalised dimension has a single element.
        var = x.var(-1, keepdim=True, unbiased=False)
        # Makes mean = 0 and variance = 1
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift
|
||||
|
||||
|
||||
class DummyGPT(torch.nn.Module):
|
||||
def __init__(self, config: GPTConfig):
|
||||
super().__init__()
|
||||
self.tok_embedding = torch.nn.Embedding(
|
||||
config.vocab_size,
|
||||
config.embedding_dim,
|
||||
@@ -24,4 +109,23 @@ class DummyGPT:
|
||||
config.context_length,
|
||||
config.embedding_dim,
|
||||
)
|
||||
self.dropout = torch.nn.Dropout(config.dropout)
|
||||
self.drop_emb = torch.nn.Dropout(config.dropout)
|
||||
self.trf_blocks = torch.nn.Sequential(
|
||||
*[TransformerBlock(config) for _ in range(config.n_layers)]
|
||||
)
|
||||
self.final_norm = NormLayer(config.embedding_dim)
|
||||
self.out_head = torch.nn.Linear(
|
||||
config.embedding_dim,
|
||||
config.vocab_size,
|
||||
bias=False,
|
||||
)
|
||||
|
||||
def forward(self, x: torch.Tensor) -> torch.Tensor:
|
||||
_, seq_len = x.shape
|
||||
res = self.tok_embedding(x) + self.pos_embedding(
|
||||
torch.arange(seq_len, device=x.device)
|
||||
)
|
||||
res = self.drop_emb(res)
|
||||
res = self.trf_blocks(res)
|
||||
res = self.final_norm(res)
|
||||
return self.out_head(res)
|
||||
|
||||
Reference in New Issue
Block a user