NLP 4
NLP 4
In [1]:
import torch.nn as nn
import torch
import torch.nn.functional as F
import math,copy,re
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import torchtext
import matplotlib.pyplot as plt
warnings.simplefilter("ignore")
print(torch.__version__)
2.0.0+cu118
In [2]:
class Embedding(nn.Module):
def __init__(self, vocab_size, embed_dim):
"""
Args:
vocab_size: size of vocabulary
embed_dim: dimension of embeddings
"""
super(Embedding, self).__init__()
self.embed = nn.Embedding(vocab_size, embed_dim)
def forward(self, x):
"""
Args:
x: input vector
Returns:
out: embedding vector
"""
out = self.embed(x)
return out
localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 1/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook
In [4]:
class PositionalEmbedding(nn.Module):
def __init__(self,max_seq_len,embed_model_dim):
"""
Args:
seq_len: length of input sequence
embed_model_dim: demension of embedding
"""
super(PositionalEmbedding, self).__init__()
self.embed_dim = embed_model_dim
pe = torch.zeros(max_seq_len,self.embed_dim)
for pos in range(max_seq_len):
for i in range(0,self.embed_dim,2):
pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/self.embed_dim)))
pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/self.embed_dim))
pe = pe.unsqueeze(0)
self.register_buffer('pe', pe)
localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 2/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook
localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 3/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook
In [5]:
class MultiHeadAttention(nn.Module):
def __init__(self, embed_dim=512, n_heads=8):
"""
Args:
embed_dim: dimension of embeding vector output
n_heads: number of self attention heads
"""
super(MultiHeadAttention, self).__init__()
def forward(self,key,query,value,mask=None):
"""
Args:
key : key vector
query : query vector
value : value vector
mask: mask for decoder
Returns:
output vector from multihead attention
"""
batch_size = key.size(0)
seq_length = key.size(1)
seq_length_query = query.size(1)
# 32x10x512
key = key.view(batch_size, seq_length, self.n_heads, self.single_head_dim) #batc
query = query.view(batch_size, seq_length_query, self.n_heads, self.single_head_d
value = value.view(batch_size, seq_length, self.n_heads, self.single_head_dim) #(
k = self.key_matrix(key) # (32x10x8x64)
q = self.query_matrix(query)
v = self.value_matrix(value)
q = q.transpose(1,2) # (batch_size, n_heads, seq_len, single_head_dim) # (32
k = k.transpose(1,2) # (batch_size, n_heads, seq_len, single_head_dim)
v = v.transpose(1,2) # (batch_size, n_heads, seq_len, single_head_dim)
# computes attention
# adjust key for matrix multiplication
k_adjusted = k.transpose(-1,-2) #(batch_size, n_heads, single_head_dim, seq_ken)
product = torch.matmul(q, k_adjusted) #(32 x 8 x 10 x 64) x (32 x 8 x 64 x 10) =
if mask is not None:
product = product.masked_fill(mask == 0, float("-1e20"))
#applying softmax
scores = F.softmax(product, dim=-1)
return output
In [8]:
class TransformerBlock(nn.Module):
def __init__(self, embed_dim, expansion_factor=4, n_heads=8):
super(TransformerBlock, self).__init__()
self.attention = MultiHeadAttention(embed_dim, n_heads)
self.norm1 = nn.LayerNorm(embed_dim)
self.norm2 = nn.LayerNorm(embed_dim)
self.feed_forward = nn.Sequential(nn.Linear(embed_dim, expansion_factor*embed_dim),nn
self.dropout1 = nn.Dropout(0.2)
self.dropout2 = nn.Dropout(0.2)
def forward(self,key,query,value):
attention_out = self.attention(key,query,value) #32x10x512
attention_residual_out = attention_out + value #32x10x512
norm1_out = self.dropout1(self.norm1(attention_residual_out)) #32x10x512
feed_fwd_out = self.feed_forward(norm1_out) #32x10x512 -> #32x10x2048 -> 32x10x512
feed_fwd_residual_out = feed_fwd_out + norm1_out #32x10x512
norm2_out = self.dropout2(self.norm2(feed_fwd_residual_out)) #32x10x512
return norm2_out
class TransformerEncoder(nn.Module):
def __init__(self, seq_len, vocab_size, embed_dim, num_layers=2, expansion_factor=4, n_
super(TransformerEncoder, self).__init__()
self.embedding_layer = Embedding(vocab_size, embed_dim)
self.positional_encoder = PositionalEmbedding(seq_len, embed_dim)
self.layers = nn.ModuleList([TransformerBlock(embed_dim, expansion_factor, n_heads) f
localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 5/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook
In [9]:
class DecoderBlock(nn.Module):
def __init__(self, embed_dim, expansion_factor=4, n_heads=8):
super(DecoderBlock, self).__init__()
self.attention = MultiHeadAttention(embed_dim, n_heads=8)
self.norm = nn.LayerNorm(embed_dim)
self.dropout = nn.Dropout(0.2)
self.transformer_block = TransformerBlock(embed_dim, expansion_factor, n_heads)
def forward(self, key, query, x,mask):
attention = self.attention(x,x,x,mask=mask) #32x10x512
value = self.dropout(self.norm(attention + x))
out = self.transformer_block(key, query, value)
return out
class TransformerDecoder(nn.Module):
def __init__(self, target_vocab_size, embed_dim, seq_len, num_layers=2, expansion_facto
super(TransformerDecoder, self).__init__()
self.word_embedding = nn.Embedding(target_vocab_size, embed_dim)
self.position_embedding = PositionalEmbedding(seq_len, embed_dim)
self.layers = nn.ModuleList([DecoderBlock(embed_dim, expansion_factor=4, n_heads=8) f
self.fc_out = nn.Linear(embed_dim, target_vocab_size)
self.dropout = nn.Dropout(0.2)
def forward(self, x, enc_out, mask):
x = self.word_embedding(x) #32x10x512
x = self.position_embedding(x) #32x10x512
x = self.dropout(x)
localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 6/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook
In [10]:
class Transformer(nn.Module):
def __init__(self, embed_dim, src_vocab_size, target_vocab_size, seq_length,num_layer
super(Transformer, self).__init__()
self.target_vocab_size = target_vocab_size
out = out.argmax(-1)
out_labels.append(out.item())
out = torch.unsqueeze(out,axis=0)
return out_labels
def forward(self, src, trg):
trg_mask = self.make_trg_mask(trg)
enc_out = self.encoder(src)
localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 7/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook
In [11]:
src_vocab_size = 11
target_vocab_size = 11
num_layers = 6
seq_length= 12
print(src.shape,target.shape)
model = Transformer(embed_dim=512, src_vocab_size=src_vocab_size,
target_vocab_size=target_vocab_size, seq_length=seq_length,
num_layers=num_layers, expansion_factor=4, n_heads=8)
model
localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 8/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook
Out[11]:
In [12]:
Out[12]:
localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 9/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook
Transformer(
In [13]:
(encoder): TransformerEncoder(
model(embedding_layer): Embedding( src_vocab_size=src_vocab_size,
= Transformer(embed_dim=512,
(embed): Embedding(11, 512)
target_vocab_size=target_vocab_size, seq_length=seq_length,
) num_layers=num_layers, expansion_factor=4, n_heads=8)
(positional_encoder): PositionalEmbedding()
(layers): ModuleList(
(0-5): 6 x TransformerBlock(
(attention): MultiHeadAttention(
src = torch.tensor([[0, 2, 5, 6, 4, 3, 9, 5, 2, 9, 10, 1]])
(query_matrix): Linear(in_features=64, out_features=64, bias=Fal
trg = torch.tensor([[0]])
se)
print(src.shape,trg.shape)
(key_matrix): trg)
out = model.decode(src, Linear(in_features=64, out_features=64, bias=Fals
e)
out
(value_matrix): Linear(in_features=64, out_features=64, bias=Fal
torch.Size([1,
se) 12]) torch.Size([1, 1])
(out): Linear(in_features=512, out_features=512, bias=True)
Out[13]:
)
[0, 0, 0,(norm1): LayerNorm((512,),
0, 0, 0, eps=1e-05, elementwise_affine=True)
0, 0, 0, 0, 0, 0]
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(feed_forward): Sequential(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): ReLU()
(2): Linear(in_features=2048, out_features=512, bias=True)
)
(dropout1): Dropout(p=0.2, inplace=False)
(dropout2): Dropout(p=0.2, inplace=False)
localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 10/10