Karpathy MinGPT Model

Source: karpathy/minGPT, file mingpt/model.py (master branch)
https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
310 lines (272 sloc) · 14.3 KB

1 """
2 Full definition of a GPT Language Model, all of it in this single file.
3
4 References:
5 1) the official GPT-2 TensorFlow implementation released by OpenAI:
6 https://github.com/openai/gpt-2/blob/master/src/model.py
7 2) huggingface/transformers PyTorch implementation:
8 https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt
9 """
10
11 import math
12
13 import torch
14 import torch.nn as nn
15 from torch.nn import functional as F
16
17 from mingpt.utils import CfgNode as CN
18
19 # -----------------------------------------------------------------------------
20
21 class NewGELU(nn.Module):
22 """
23 Implementation of the GELU activation function currently in Google BERT repo (identical to
24 Reference: Gaussian Error Linear Units (GELU) paper: https://arxiv.org/abs/1606.08415
25 """
26 def forward(self, x):
27 return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow
28
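# Note (not part of the original file): this is the tanh approximation of GELU. On
# recent PyTorch (>= 1.12) the same approximation is available as
# F.gelu(x, approximate='tanh'), so a quick sanity-check sketch could be:
#
#   x = torch.randn(2, 3)
#   assert torch.allclose(NewGELU()(x), F.gelu(x, approximate='tanh'), atol=1e-6)
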
class CausalSelfAttention(nn.Module):
    """
    A vanilla multi-head masked self-attention layer with a projection at the end.
    It is possible to use torch.nn.MultiheadAttention here but I am including an
    explicit implementation here to show that there is nothing too scary here.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        # regularization
        self.attn_dropout = nn.Dropout(config.attn_pdrop)
        self.resid_dropout = nn.Dropout(config.resid_pdrop)
        # causal mask to ensure that attention is only applied to the left in the input sequence
        self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
                                     .view(1, 1, config.block_size, config.block_size))
        self.n_head = config.n_head
        self.n_embd = config.n_embd

    def forward(self, x):
        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)

        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)

        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
        att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
        att = F.softmax(att, dim=-1)
        att = self.attn_dropout(att)
        y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side

        # output projection
        y = self.resid_dropout(self.c_proj(y))
        return y

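# Usage sketch (not part of the original file): the layer maps (B, T, C) -> (B, T, C).
# This assumes mingpt.utils.CfgNode accepts keyword arguments; the field values below
# are illustrative, matching the 'gpt-nano' sizes defined further down in this file.
#
#   cfg = CN(n_embd=48, n_head=3, block_size=8, attn_pdrop=0.1, resid_pdrop=0.1)
#   attn = CausalSelfAttention(cfg)
#   out = attn(torch.randn(4, 8, 48))   # -> torch.Size([4, 8, 48])
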
class Block(nn.Module):
    """ an unassuming Transformer block """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = nn.ModuleDict(dict(
            c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd),
            c_proj  = nn.Linear(4 * config.n_embd, config.n_embd),
            act     = NewGELU(),
            dropout = nn.Dropout(config.resid_pdrop),
        ))
        m = self.mlp
        self.mlpf = lambda x: m.dropout(m.c_proj(m.act(m.c_fc(x)))) # MLP forward

    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlpf(self.ln_2(x))
        return x

class GPT(nn.Module):
    """ GPT Language Model """

    @staticmethod
    def get_default_config():
        C = CN()
        # either model_type or (n_layer, n_head, n_embd) must be given in the config
        C.model_type = 'gpt'
        C.n_layer = None
        C.n_head = None
        C.n_embd = None
        # these options must be filled in externally
        C.vocab_size = None
        C.block_size = None
        # dropout hyperparameters
        C.embd_pdrop = 0.1
        C.resid_pdrop = 0.1
        C.attn_pdrop = 0.1
        return C

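    # Usage sketch (not part of the original file): a typical caller fills in the two
    # externally-required fields and either a model_type or explicit sizes before
    # constructing the model; the values below are illustrative.
    #
    #   config = GPT.get_default_config()
    #   config.model_type = 'gpt-nano'
    #   config.vocab_size = 50257
    #   config.block_size = 128
    #   model = GPT(config)
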
    def __init__(self, config):
        super().__init__()
        assert config.vocab_size is not None
        assert config.block_size is not None
        self.block_size = config.block_size

        type_given = config.model_type is not None
        params_given = all([config.n_layer is not None, config.n_head is not None, config.n_embd is not None])
        assert type_given ^ params_given # exactly one of these (XOR)
        if type_given:
            # translate from model_type to detailed configuration
            config.merge_from_dict({
                # names follow the huggingface naming conventions
                # GPT-1
                'openai-gpt':   dict(n_layer=12, n_head=12, n_embd=768),  # 117M params
                # GPT-2 configs
                'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
                'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
                'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
                'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
                # Gophers
                'gopher-44m':   dict(n_layer=8, n_head=16, n_embd=512),
                # (there are a number more...)
                # I made these tiny models up
                'gpt-mini':     dict(n_layer=6, n_head=6, n_embd=192),
                'gpt-micro':    dict(n_layer=4, n_head=4, n_embd=128),
                'gpt-nano':     dict(n_layer=3, n_head=3, n_embd=48),
            }[config.model_type])

        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(config.vocab_size, config.n_embd),
            wpe = nn.Embedding(config.block_size, config.n_embd),
            drop = nn.Dropout(config.embd_pdrop),
            h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
            ln_f = nn.LayerNorm(config.n_embd),
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # init all weights, and apply a special scaled init to the residual projections, per GPT-2 paper
        self.apply(self._init_weights)
        for pn, p in self.named_parameters():
            if pn.endswith('c_proj.weight'):
                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

        # report number of parameters (note we don't count the decoder parameters in lm_head)
        n_params = sum(p.numel() for p in self.transformer.parameters())
        print("number of parameters: %.2fM" % (n_params/1e6,))

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
        elif isinstance(module, nn.LayerNorm):
            torch.nn.init.zeros_(module.bias)
            torch.nn.init.ones_(module.weight)

    @classmethod
    def from_pretrained(cls, model_type):
        """
        Initialize a pretrained GPT model by copying over the weights
        from a huggingface/transformers checkpoint.
        """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
        from transformers import GPT2LMHeadModel

        # create a from-scratch initialized minGPT model
        config = cls.get_default_config()
        config.model_type = model_type
        config.vocab_size = 50257 # openai's model vocabulary
        config.block_size = 1024  # openai's model block_size
        model = GPT(config)
        sd = model.state_dict()

        # init a huggingface/transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()

        # copy while ensuring all of the parameters are aligned and match in names and shapes
        keys = [k for k in sd_hf if not k.endswith('attn.masked_bias')] # ignore these
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
        # this means that we have to transpose these weights when we import them
        assert len(keys) == len(sd)
        for k in keys:
            if any(k.endswith(w) for w in transposed):
                # special treatment for the Conv1D weights we need to transpose
                assert sd_hf[k].shape[::-1] == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].t())
            else:
                # vanilla copy over the other parameters
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])

        return model

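    # Usage sketch (not part of the original file): load the smallest GPT-2 checkpoint.
    # Requires the `transformers` package plus a downloaded/cached copy of the weights.
    #
    #   model = GPT.from_pretrained('gpt2')
    #   model.eval()
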
    def configure_optimizers(self, train_config):
        """
        This long function is unfortunately doing something very simple and is being very defensive:
        We are separating out all parameters of the model into two buckets: those that will experience
        weight decay for regularization and those that won't (biases, and layernorm/embedding weights).
        We are then returning the PyTorch optimizer object.
        """

        # separate out all parameters to those that will and won't experience regularizing weight decay
        decay = set()
        no_decay = set()
        whitelist_weight_modules = (torch.nn.Linear, )
        blacklist_weight_modules = (torch.nn.LayerNorm, torch.nn.Embedding)
        for mn, m in self.named_modules():
            for pn, p in m.named_parameters():
                fpn = '%s.%s' % (mn, pn) if mn else pn # full param name
                # random note: because named_modules and named_parameters are recursive
                # we will see the same tensors p many many times. but doing it this way
                # allows us to know which parent module any tensor p belongs to...
                if pn.endswith('bias'):
                    # all biases will not be decayed
                    no_decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
                    # weights of whitelist modules will be weight decayed
                    decay.add(fpn)
                elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
                    # weights of blacklist modules will NOT be weight decayed
                    no_decay.add(fpn)

        # validate that we considered every parameter
        param_dict = {pn: p for pn, p in self.named_parameters()}
        inter_params = decay & no_decay
        union_params = decay | no_decay
        assert len(inter_params) == 0, "parameters %s made it into both decay/no_decay sets!" % (str(inter_params), )
        assert len(param_dict.keys() - union_params) == 0, "parameters %s were not separated into either decay/no_decay set!" \
                                                    % (str(param_dict.keys() - union_params), )

        # create the pytorch optimizer object
        optim_groups = [
            {"params": [param_dict[pn] for pn in sorted(list(decay))], "weight_decay": train_config.weight_decay},
            {"params": [param_dict[pn] for pn in sorted(list(no_decay))], "weight_decay": 0.0},
        ]
        optimizer = torch.optim.AdamW(optim_groups, lr=train_config.learning_rate, betas=train_config.betas)
        return optimizer

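    # Usage sketch (not part of the original file): train_config is assumed to be a
    # CfgNode-like object exposing weight_decay, learning_rate and betas; the values
    # below are illustrative defaults, not prescribed by this file.
    #
    #   tc = CN(weight_decay=0.1, learning_rate=3e-4, betas=(0.9, 0.95))
    #   optimizer = model.configure_optimizers(tc)
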
    def forward(self, idx, targets=None):
        device = idx.device
        b, t = idx.size()
        assert t <= self.block_size, f"Cannot forward sequence of length {t}, block size is only {self.block_size}"
        pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0) # shape (1, t)

        # forward the GPT model itself
        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (1, t, n_embd)
        x = self.transformer.drop(tok_emb + pos_emb)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)
        logits = self.lm_head(x)

        # if we are given some desired targets also calculate the loss
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)

        return logits, loss

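    # Usage sketch (not part of the original file): idx and targets are (b, t)
    # LongTensors of token ids; shifting the targets by one position gives the usual
    # next-token objective. `model` and `config` refer to an instance built as above.
    #
    #   tokens = torch.randint(0, config.vocab_size, (2, 17))
    #   idx, targets = tokens[:, :-1], tokens[:, 1:]
    #   logits, loss = model(idx, targets)   # logits: (2, 16, vocab_size)
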
    @torch.no_grad()
    def generate(self, idx, max_new_tokens, temperature=1.0, do_sample=False, top_k=None):
        """
        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
        the sequence max_new_tokens times, feeding the predictions back into the model each time.
        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
        """
        for _ in range(max_new_tokens):
            # if the sequence context is growing too long we must crop it at block_size
            idx_cond = idx if idx.size(1) <= self.block_size else idx[:, -self.block_size:]
            # forward the model to get the logits for the index in the sequence
            logits, _ = self(idx_cond)
            # pluck the logits at the final step and scale by desired temperature
            logits = logits[:, -1, :] / temperature
            # optionally crop the logits to only the top k options
            if top_k is not None:
                v, _ = torch.topk(logits, top_k)
                logits[logits < v[:, [-1]]] = -float('Inf')
            # apply softmax to convert logits to (normalized) probabilities
            probs = F.softmax(logits, dim=-1)
            # either sample from the distribution or take the most likely element
            if do_sample:
                idx_next = torch.multinomial(probs, num_samples=1)
            else:
                _, idx_next = torch.topk(probs, k=1, dim=-1)
            # append sampled index to the running sequence and continue
            idx = torch.cat((idx, idx_next), dim=1)

        return idx
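
# Usage sketch (not part of the original file): sampled completion from a toy prompt;
# model.eval() disables dropout, as the docstring above suggests. `model` refers to an
# instance built as in the earlier sketches, and token id 0 is just a placeholder prompt.
#
#   model.eval()
#   prompt = torch.zeros((1, 1), dtype=torch.long)
#   out = model.generate(prompt, max_new_tokens=20, do_sample=True, top_k=40)
#   print(out.shape)   # torch.Size([1, 21])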
