0% found this document useful (0 votes)
9 views10 pages

NLP 4

The document is a Jupyter Notebook implementing a Transformer model for Natural Language Processing using PyTorch. It includes classes for embedding, positional encoding, multi-head attention, and both encoder and decoder blocks, culminating in a complete Transformer architecture. The notebook also demonstrates the model's initialization and a sample input-output operation.

Uploaded by

Pratiksha Darade
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
9 views10 pages

NLP 4

The document is a Jupyter Notebook implementing a Transformer model for Natural Language Processing using PyTorch. It includes classes for embedding, positional encoding, multi-head attention, and both encoder and decoder blocks, culminating in a complete Transformer architecture. The notebook also demonstrates the model's initialization and a sample input-output operation.

Uploaded by

Pratiksha Darade
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 10

4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

In [1]:

import torch.nn as nn
import torch
import torch.nn.functional as F
import math,copy,re
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import torchtext
import matplotlib.pyplot as plt
warnings.simplefilter("ignore")
print(torch.__version__)

2.0.0+cu118

In [2]:

class Embedding(nn.Module):
def __init__(self, vocab_size, embed_dim):
"""
Args:
vocab_size: size of vocabulary
embed_dim: dimension of embeddings
"""
super(Embedding, self).__init__()
self.embed = nn.Embedding(vocab_size, embed_dim)
def forward(self, x):
"""
Args:
x: input vector
Returns:
out: embedding vector
"""
out = self.embed(x)
return out

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 1/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

In [4]:

class PositionalEmbedding(nn.Module):
def __init__(self,max_seq_len,embed_model_dim):
"""
Args:
seq_len: length of input sequence
embed_model_dim: demension of embedding
"""
super(PositionalEmbedding, self).__init__()
self.embed_dim = embed_model_dim

pe = torch.zeros(max_seq_len,self.embed_dim)
for pos in range(max_seq_len):
for i in range(0,self.embed_dim,2):
pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/self.embed_dim)))
pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/self.embed_dim))
pe = pe.unsqueeze(0)
self.register_buffer('pe', pe)

def forward(self, x):


"""
Args:
x: input vector
Returns:
x: output
"""
x = x * math.sqrt(self.embed_dim)
#add constant to embedding
seq_len = x.size(1)
x = x + torch.autograd.Variable(self.pe[:,:seq_len], requires_grad=False)
return x

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 2/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 3/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

In [5]:

class MultiHeadAttention(nn.Module):
def __init__(self, embed_dim=512, n_heads=8):
"""
Args:
embed_dim: dimension of embeding vector output
n_heads: number of self attention heads
"""
super(MultiHeadAttention, self).__init__()

self.embed_dim = embed_dim #512 dim


self.n_heads = n_heads #8
self.single_head_dim = int(self.embed_dim / self.n_heads)
self.query_matrix = nn.Linear(self.single_head_dim , self.single_head_dim ,bias=F
self.key_matrix = nn.Linear(self.single_head_dim , self.single_head_dim, bias=Fa
self.value_matrix = nn.Linear(self.single_head_dim ,self.single_head_dim , bias=F
self.out = nn.Linear(self.n_heads*self.single_head_dim ,self.embed_dim)

def forward(self,key,query,value,mask=None):

"""
Args:
key : key vector
query : query vector
value : value vector
mask: mask for decoder

Returns:
output vector from multihead attention
"""
batch_size = key.size(0)
seq_length = key.size(1)
seq_length_query = query.size(1)

# 32x10x512
key = key.view(batch_size, seq_length, self.n_heads, self.single_head_dim) #batc
query = query.view(batch_size, seq_length_query, self.n_heads, self.single_head_d
value = value.view(batch_size, seq_length, self.n_heads, self.single_head_dim) #(

k = self.key_matrix(key) # (32x10x8x64)
q = self.query_matrix(query)
v = self.value_matrix(value)
q = q.transpose(1,2) # (batch_size, n_heads, seq_len, single_head_dim) # (32
k = k.transpose(1,2) # (batch_size, n_heads, seq_len, single_head_dim)
v = v.transpose(1,2) # (batch_size, n_heads, seq_len, single_head_dim)

# computes attention
# adjust key for matrix multiplication
k_adjusted = k.transpose(-1,-2) #(batch_size, n_heads, single_head_dim, seq_ken)
product = torch.matmul(q, k_adjusted) #(32 x 8 x 10 x 64) x (32 x 8 x 64 x 10) =
if mask is not None:
product = product.masked_fill(mask == 0, float("-1e20"))

#divising by square root of key dimension


product = product / math.sqrt(self.single_head_dim) # / sqrt(64)

#applying softmax
scores = F.softmax(product, dim=-1)

#mutiply with value matrix


localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 4/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

scores = torch.matmul(scores, v) ##(32x8x 10x 10) x (32 x 8 x 10 x 64) = (32 x 8


concat = scores.transpose(1,2).contiguous().view(batch_size, seq_length_query, se

output = self.out(concat) #(32,10,512) -> (32,10,512)

return output

In [8]:

class TransformerBlock(nn.Module):
def __init__(self, embed_dim, expansion_factor=4, n_heads=8):
super(TransformerBlock, self).__init__()
self.attention = MultiHeadAttention(embed_dim, n_heads)
self.norm1 = nn.LayerNorm(embed_dim)
self.norm2 = nn.LayerNorm(embed_dim)
self.feed_forward = nn.Sequential(nn.Linear(embed_dim, expansion_factor*embed_dim),nn
self.dropout1 = nn.Dropout(0.2)
self.dropout2 = nn.Dropout(0.2)

def forward(self,key,query,value):
attention_out = self.attention(key,query,value) #32x10x512
attention_residual_out = attention_out + value #32x10x512
norm1_out = self.dropout1(self.norm1(attention_residual_out)) #32x10x512
feed_fwd_out = self.feed_forward(norm1_out) #32x10x512 -> #32x10x2048 -> 32x10x512
feed_fwd_residual_out = feed_fwd_out + norm1_out #32x10x512
norm2_out = self.dropout2(self.norm2(feed_fwd_residual_out)) #32x10x512
return norm2_out

class TransformerEncoder(nn.Module):
def __init__(self, seq_len, vocab_size, embed_dim, num_layers=2, expansion_factor=4, n_
super(TransformerEncoder, self).__init__()
self.embedding_layer = Embedding(vocab_size, embed_dim)
self.positional_encoder = PositionalEmbedding(seq_len, embed_dim)
self.layers = nn.ModuleList([TransformerBlock(embed_dim, expansion_factor, n_heads) f

def forward(self, x):


embed_out = self.embedding_layer(x)
out = self.positional_encoder(embed_out)
for layer in self.layers:
out = layer(out,out,out)
return out

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 5/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

In [9]:

class DecoderBlock(nn.Module):
def __init__(self, embed_dim, expansion_factor=4, n_heads=8):
super(DecoderBlock, self).__init__()
self.attention = MultiHeadAttention(embed_dim, n_heads=8)
self.norm = nn.LayerNorm(embed_dim)
self.dropout = nn.Dropout(0.2)
self.transformer_block = TransformerBlock(embed_dim, expansion_factor, n_heads)
def forward(self, key, query, x,mask):
attention = self.attention(x,x,x,mask=mask) #32x10x512
value = self.dropout(self.norm(attention + x))
out = self.transformer_block(key, query, value)
return out

class TransformerDecoder(nn.Module):
def __init__(self, target_vocab_size, embed_dim, seq_len, num_layers=2, expansion_facto
super(TransformerDecoder, self).__init__()
self.word_embedding = nn.Embedding(target_vocab_size, embed_dim)
self.position_embedding = PositionalEmbedding(seq_len, embed_dim)
self.layers = nn.ModuleList([DecoderBlock(embed_dim, expansion_factor=4, n_heads=8) f
self.fc_out = nn.Linear(embed_dim, target_vocab_size)
self.dropout = nn.Dropout(0.2)
def forward(self, x, enc_out, mask):
x = self.word_embedding(x) #32x10x512
x = self.position_embedding(x) #32x10x512
x = self.dropout(x)

for layer in self.layers:


x = layer(enc_out, x, enc_out, mask)
out = F.softmax(self.fc_out(x))
return out

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 6/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

In [10]:

class Transformer(nn.Module):
def __init__(self, embed_dim, src_vocab_size, target_vocab_size, seq_length,num_layer
super(Transformer, self).__init__()
self.target_vocab_size = target_vocab_size

self.encoder = TransformerEncoder(seq_length, src_vocab_size, embed_dim, num_laye


self.decoder = TransformerDecoder(target_vocab_size, embed_dim, seq_length, num_l
def make_trg_mask(self, trg):
batch_size, trg_len = trg.shape
trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(batch_size, 1, trg_len
return trg_mask
def decode(self,src,trg):
trg_mask = self.make_trg_mask(trg)
enc_out = self.encoder(src)
out_labels = []
batch_size,seq_len = src.shape[0],src.shape[1]
out = trg
for i in range(seq_len):
out = self.decoder(out,enc_out,trg_mask) #bs x seq_len x vocab_dim
# taking the last token
out = out[:,-1,:]

out = out.argmax(-1)
out_labels.append(out.item())
out = torch.unsqueeze(out,axis=0)
return out_labels
def forward(self, src, trg):
trg_mask = self.make_trg_mask(trg)
enc_out = self.encoder(src)

outputs = self.decoder(trg, enc_out, trg_mask)


return outputs

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 7/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

In [11]:

src_vocab_size = 11
target_vocab_size = 11
num_layers = 6
seq_length= 12

# let 0 be sos token and 1 be eos token


src = torch.tensor([[0, 2, 5, 6, 4, 3, 9, 5, 2, 9, 10, 1],
[0, 2, 8, 7, 3, 4, 5, 6, 7, 2, 10, 1]])
target = torch.tensor([[0, 1, 7, 4, 3, 5, 9, 2, 8, 10, 9, 1],
[0, 1, 5, 6, 2, 4, 7, 6, 2, 8, 10, 1]])

print(src.shape,target.shape)
model = Transformer(embed_dim=512, src_vocab_size=src_vocab_size,
target_vocab_size=target_vocab_size, seq_length=seq_length,
num_layers=num_layers, expansion_factor=4, n_heads=8)
model

torch.Size([2, 12]) torch.Size([2, 12])

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 8/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

Out[11]:

In [12]:

out = model(src, target)


out.shape

Out[12]:

torch.Size([2, 12, 11])

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 9/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

Transformer(
In [13]:
(encoder): TransformerEncoder(
model(embedding_layer): Embedding( src_vocab_size=src_vocab_size,
= Transformer(embed_dim=512,
(embed): Embedding(11, 512)
target_vocab_size=target_vocab_size, seq_length=seq_length,
) num_layers=num_layers, expansion_factor=4, n_heads=8)
(positional_encoder): PositionalEmbedding()
(layers): ModuleList(
(0-5): 6 x TransformerBlock(
(attention): MultiHeadAttention(
src = torch.tensor([[0, 2, 5, 6, 4, 3, 9, 5, 2, 9, 10, 1]])
(query_matrix): Linear(in_features=64, out_features=64, bias=Fal
trg = torch.tensor([[0]])
se)
print(src.shape,trg.shape)
(key_matrix): trg)
out = model.decode(src, Linear(in_features=64, out_features=64, bias=Fals
e)
out
(value_matrix): Linear(in_features=64, out_features=64, bias=Fal
torch.Size([1,
se) 12]) torch.Size([1, 1])
(out): Linear(in_features=512, out_features=512, bias=True)
Out[13]:
)
[0, 0, 0,(norm1): LayerNorm((512,),
0, 0, 0, eps=1e-05, elementwise_affine=True)
0, 0, 0, 0, 0, 0]
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(feed_forward): Sequential(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): ReLU()
(2): Linear(in_features=2048, out_features=512, bias=True)
)
(dropout1): Dropout(p=0.2, inplace=False)
(dropout2): Dropout(p=0.2, inplace=False)

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 10/10

You might also like