0% found this document useful (0 votes)

9 views10 pages

NLP 4

The document is a Jupyter Notebook implementing a Transformer model for Natural Language Processing using PyTorch. It includes classes for embedding, positional encoding, multi-head attention, and both encoder and decoder blocks, culminating in a complete Transformer architecture. The notebook also demonstrates the model's initialization and a sample input-output operation.

Uploaded by

Pratiksha Darade

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

9 views10 pages

NLP 4

Uploaded by

Pratiksha Darade

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 10

4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

In [1]:

import torch.nn as nn
import torch
import torch.nn.functional as F
import math,copy,re
import warnings
import pandas as pd
import numpy as np
import seaborn as sns
import torchtext
import matplotlib.pyplot as plt
warnings.simplefilter("ignore")
print(torch.__version__)

2.0.0+cu118

In [2]:

class Embedding(nn.Module):
def __init__(self, vocab_size, embed_dim):
"""
Args:
vocab_size: size of vocabulary
embed_dim: dimension of embeddings
"""
super(Embedding, self).__init__()
self.embed = nn.Embedding(vocab_size, embed_dim)
def forward(self, x):
"""
Args:
x: input vector
Returns:
out: embedding vector
"""
out = self.embed(x)
return out

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 1/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

In [4]:

class PositionalEmbedding(nn.Module):
def __init__(self,max_seq_len,embed_model_dim):
"""
Args:
seq_len: length of input sequence
embed_model_dim: demension of embedding
"""
super(PositionalEmbedding, self).__init__()
self.embed_dim = embed_model_dim

pe = torch.zeros(max_seq_len,self.embed_dim)
for pos in range(max_seq_len):
for i in range(0,self.embed_dim,2):
pe[pos, i] = math.sin(pos / (10000 ** ((2 * i)/self.embed_dim)))
pe[pos, i + 1] = math.cos(pos / (10000 ** ((2 * (i + 1))/self.embed_dim))
pe = pe.unsqueeze(0)
self.register_buffer('pe', pe)

def forward(self, x):

"""
Args:
x: input vector
Returns:
x: output
"""
x = x * math.sqrt(self.embed_dim)
#add constant to embedding
seq_len = x.size(1)
x = x + torch.autograd.Variable(self.pe[:,:seq_len], requires_grad=False)
return x

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 2/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 3/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

In [5]:

class MultiHeadAttention(nn.Module):
def __init__(self, embed_dim=512, n_heads=8):
"""
Args:
embed_dim: dimension of embeding vector output
n_heads: number of self attention heads
"""
super(MultiHeadAttention, self).__init__()

self.embed_dim = embed_dim #512 dim

self.n_heads = n_heads #8
self.single_head_dim = int(self.embed_dim / self.n_heads)
self.query_matrix = nn.Linear(self.single_head_dim , self.single_head_dim ,bias=F
self.key_matrix = nn.Linear(self.single_head_dim , self.single_head_dim, bias=Fa
self.value_matrix = nn.Linear(self.single_head_dim ,self.single_head_dim , bias=F
self.out = nn.Linear(self.n_heads*self.single_head_dim ,self.embed_dim)

def forward(self,key,query,value,mask=None):

"""
Args:
key : key vector
query : query vector
value : value vector
mask: mask for decoder

Returns:
output vector from multihead attention
"""
batch_size = key.size(0)
seq_length = key.size(1)
seq_length_query = query.size(1)

# 32x10x512
key = key.view(batch_size, seq_length, self.n_heads, self.single_head_dim) #batc
query = query.view(batch_size, seq_length_query, self.n_heads, self.single_head_d
value = value.view(batch_size, seq_length, self.n_heads, self.single_head_dim) #(

k = self.key_matrix(key) # (32x10x8x64)
q = self.query_matrix(query)
v = self.value_matrix(value)
q = q.transpose(1,2) # (batch_size, n_heads, seq_len, single_head_dim) # (32
k = k.transpose(1,2) # (batch_size, n_heads, seq_len, single_head_dim)
v = v.transpose(1,2) # (batch_size, n_heads, seq_len, single_head_dim)

# computes attention
# adjust key for matrix multiplication
k_adjusted = k.transpose(-1,-2) #(batch_size, n_heads, single_head_dim, seq_ken)
product = torch.matmul(q, k_adjusted) #(32 x 8 x 10 x 64) x (32 x 8 x 64 x 10) =
if mask is not None:
product = product.masked_fill(mask == 0, float("-1e20"))

#divising by square root of key dimension

product = product / math.sqrt(self.single_head_dim) # / sqrt(64)

#applying softmax
scores = F.softmax(product, dim=-1)

#mutiply with value matrix

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 4/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

scores = torch.matmul(scores, v) ##(32x8x 10x 10) x (32 x 8 x 10 x 64) = (32 x 8

concat = scores.transpose(1,2).contiguous().view(batch_size, seq_length_query, se

output = self.out(concat) #(32,10,512) -> (32,10,512)

return output

In [8]:

class TransformerBlock(nn.Module):
def __init__(self, embed_dim, expansion_factor=4, n_heads=8):
super(TransformerBlock, self).__init__()
self.attention = MultiHeadAttention(embed_dim, n_heads)
self.norm1 = nn.LayerNorm(embed_dim)
self.norm2 = nn.LayerNorm(embed_dim)
self.feed_forward = nn.Sequential(nn.Linear(embed_dim, expansion_factor*embed_dim),nn
self.dropout1 = nn.Dropout(0.2)
self.dropout2 = nn.Dropout(0.2)

def forward(self,key,query,value):
attention_out = self.attention(key,query,value) #32x10x512
attention_residual_out = attention_out + value #32x10x512
norm1_out = self.dropout1(self.norm1(attention_residual_out)) #32x10x512
feed_fwd_out = self.feed_forward(norm1_out) #32x10x512 -> #32x10x2048 -> 32x10x512
feed_fwd_residual_out = feed_fwd_out + norm1_out #32x10x512
norm2_out = self.dropout2(self.norm2(feed_fwd_residual_out)) #32x10x512
return norm2_out

class TransformerEncoder(nn.Module):
def __init__(self, seq_len, vocab_size, embed_dim, num_layers=2, expansion_factor=4, n_
super(TransformerEncoder, self).__init__()
self.embedding_layer = Embedding(vocab_size, embed_dim)
self.positional_encoder = PositionalEmbedding(seq_len, embed_dim)
self.layers = nn.ModuleList([TransformerBlock(embed_dim, expansion_factor, n_heads) f

def forward(self, x):

embed_out = self.embedding_layer(x)
out = self.positional_encoder(embed_out)
for layer in self.layers:
out = layer(out,out,out)
return out

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 5/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

In [9]:

class DecoderBlock(nn.Module):
def __init__(self, embed_dim, expansion_factor=4, n_heads=8):
super(DecoderBlock, self).__init__()
self.attention = MultiHeadAttention(embed_dim, n_heads=8)
self.norm = nn.LayerNorm(embed_dim)
self.dropout = nn.Dropout(0.2)
self.transformer_block = TransformerBlock(embed_dim, expansion_factor, n_heads)
def forward(self, key, query, x,mask):
attention = self.attention(x,x,x,mask=mask) #32x10x512
value = self.dropout(self.norm(attention + x))
out = self.transformer_block(key, query, value)
return out

class TransformerDecoder(nn.Module):
def __init__(self, target_vocab_size, embed_dim, seq_len, num_layers=2, expansion_facto
super(TransformerDecoder, self).__init__()
self.word_embedding = nn.Embedding(target_vocab_size, embed_dim)
self.position_embedding = PositionalEmbedding(seq_len, embed_dim)
self.layers = nn.ModuleList([DecoderBlock(embed_dim, expansion_factor=4, n_heads=8) f
self.fc_out = nn.Linear(embed_dim, target_vocab_size)
self.dropout = nn.Dropout(0.2)
def forward(self, x, enc_out, mask):
x = self.word_embedding(x) #32x10x512
x = self.position_embedding(x) #32x10x512
x = self.dropout(x)

for layer in self.layers:

x = layer(enc_out, x, enc_out, mask)
out = F.softmax(self.fc_out(x))
return out

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 6/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

In [10]:

class Transformer(nn.Module):
def __init__(self, embed_dim, src_vocab_size, target_vocab_size, seq_length,num_layer
super(Transformer, self).__init__()
self.target_vocab_size = target_vocab_size

self.encoder = TransformerEncoder(seq_length, src_vocab_size, embed_dim, num_laye

self.decoder = TransformerDecoder(target_vocab_size, embed_dim, seq_length, num_l
def make_trg_mask(self, trg):
batch_size, trg_len = trg.shape
trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(batch_size, 1, trg_len
return trg_mask
def decode(self,src,trg):
trg_mask = self.make_trg_mask(trg)
enc_out = self.encoder(src)
out_labels = []
batch_size,seq_len = src.shape[0],src.shape[1]
out = trg
for i in range(seq_len):
out = self.decoder(out,enc_out,trg_mask) #bs x seq_len x vocab_dim
# taking the last token
out = out[:,-1,:]

out = out.argmax(-1)
out_labels.append(out.item())
out = torch.unsqueeze(out,axis=0)
return out_labels
def forward(self, src, trg):
trg_mask = self.make_trg_mask(trg)
enc_out = self.encoder(src)

outputs = self.decoder(trg, enc_out, trg_mask)

return outputs

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 7/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

In [11]:

src_vocab_size = 11
target_vocab_size = 11
num_layers = 6
seq_length= 12

# let 0 be sos token and 1 be eos token

src = torch.tensor([[0, 2, 5, 6, 4, 3, 9, 5, 2, 9, 10, 1],
[0, 2, 8, 7, 3, 4, 5, 6, 7, 2, 10, 1]])
target = torch.tensor([[0, 1, 7, 4, 3, 5, 9, 2, 8, 10, 9, 1],
[0, 1, 5, 6, 2, 4, 7, 6, 2, 8, 10, 1]])

print(src.shape,target.shape)
model = Transformer(embed_dim=512, src_vocab_size=src_vocab_size,
target_vocab_size=target_vocab_size, seq_length=seq_length,
num_layers=num_layers, expansion_factor=4, n_heads=8)
model

torch.Size([2, 12]) torch.Size([2, 12])

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 8/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

Out[11]:

In [12]:

out = model(src, target)

out.shape

Out[12]:

torch.Size([2, 12, 11])

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 9/10
4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

Transformer(
In [13]:
(encoder): TransformerEncoder(
model(embedding_layer): Embedding( src_vocab_size=src_vocab_size,
= Transformer(embed_dim=512,
(embed): Embedding(11, 512)
target_vocab_size=target_vocab_size, seq_length=seq_length,
) num_layers=num_layers, expansion_factor=4, n_heads=8)
(positional_encoder): PositionalEmbedding()
(layers): ModuleList(
(0-5): 6 x TransformerBlock(
(attention): MultiHeadAttention(
src = torch.tensor([[0, 2, 5, 6, 4, 3, 9, 5, 2, 9, 10, 1]])
(query_matrix): Linear(in_features=64, out_features=64, bias=Fal
trg = torch.tensor([[0]])
se)
print(src.shape,trg.shape)
(key_matrix): trg)
out = model.decode(src, Linear(in_features=64, out_features=64, bias=Fals
e)
out
(value_matrix): Linear(in_features=64, out_features=64, bias=Fal
torch.Size([1,
se) 12]) torch.Size([1, 1])
(out): Linear(in_features=512, out_features=512, bias=True)
Out[13]:
)
[0, 0, 0,(norm1): LayerNorm((512,),
0, 0, 0, eps=1e-05, elementwise_affine=True)
0, 0, 0, 0, 0, 0]
(norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
(feed_forward): Sequential(
(0): Linear(in_features=512, out_features=2048, bias=True)
(1): ReLU()
(2): Linear(in_features=2048, out_features=512, bias=True)
)
(dropout1): Dropout(p=0.2, inplace=False)
(dropout2): Dropout(p=0.2, inplace=False)

localhost:8888/notebooks/Downloads/Semester_8/NLP_Practicals/Transformer_ASSIGN_4_NLP.ipynb 10/10

Pile Type 1 - Screw Pile Load Test Outline (Terna)
No ratings yet
Pile Type 1 - Screw Pile Load Test Outline (Terna)
113 pages
MCQ Hot Air Oven
No ratings yet
MCQ Hot Air Oven
15 pages
Handbook Rheometer
No ratings yet
Handbook Rheometer
328 pages
The Annotated Transformer
No ratings yet
The Annotated Transformer
59 pages
Formulation and Evaluation of Topical Herbal Gel For The Treatment
No ratings yet
Formulation and Evaluation of Topical Herbal Gel For The Treatment
16 pages
LSTM From Scratch in Python
No ratings yet
LSTM From Scratch in Python
11 pages
Shop Manual PC27MRX1 PC30MRX1 PC35MRX1 PC40MRX1 PC45MRX1
No ratings yet
Shop Manual PC27MRX1 PC30MRX1 PC35MRX1 PC40MRX1 PC45MRX1
946 pages
GPT4 Architecture
No ratings yet
GPT4 Architecture
2 pages
Caie Igcse Mathematics Theory Znotes
No ratings yet
Caie Igcse Mathematics Theory Znotes
21 pages
Maths Test 1 PDF
No ratings yet
Maths Test 1 PDF
2 pages
Transformers Torch
No ratings yet
Transformers Torch
38 pages
AE556 2024 Topic7 Transformer
No ratings yet
AE556 2024 Topic7 Transformer
49 pages
Engine Table of Contents
No ratings yet
Engine Table of Contents
248 pages
Face Recognition - Ipynb
No ratings yet
Face Recognition - Ipynb
128 pages
Transformer
No ratings yet
Transformer
58 pages
NLP Week8 Transformers
No ratings yet
NLP Week8 Transformers
66 pages
Font Transfer 2 Autoencoders
No ratings yet
Font Transfer 2 Autoencoders
78 pages
CNNs and Transformers
No ratings yet
CNNs and Transformers
90 pages
Infobasic Programming and T-24 Standard
No ratings yet
Infobasic Programming and T-24 Standard
7 pages
Chapter 2
No ratings yet
Chapter 2
52 pages
Computer Project
No ratings yet
Computer Project
59 pages
Downloaded by R GAYATHRI (R.gayathri@aalimec - Ac.in)
No ratings yet
Downloaded by R GAYATHRI (R.gayathri@aalimec - Ac.in)
56 pages
Final Analytical Cement Mortar Grouting Exceed BOQ Report 11.03.2024 From TL
No ratings yet
Final Analytical Cement Mortar Grouting Exceed BOQ Report 11.03.2024 From TL
55 pages
Lecture Notes - Advanced Language Model - BERT, GPT
No ratings yet
Lecture Notes - Advanced Language Model - BERT, GPT
24 pages
Ad3511-Deep Learning-Lab Manual
No ratings yet
Ad3511-Deep Learning-Lab Manual
53 pages
Bahdanau Attention Mechanism (Also Known As Additive Attention)
No ratings yet
Bahdanau Attention Mechanism (Also Known As Additive Attention)
41 pages
Agilent 54622D Oscilloscope Service
No ratings yet
Agilent 54622D Oscilloscope Service
118 pages
Computer Vision 11 Transformers
No ratings yet
Computer Vision 11 Transformers
63 pages
495 Lecture 10 Attall
No ratings yet
495 Lecture 10 Attall
18 pages
Project Source
No ratings yet
Project Source
21 pages
Position Encoding: Intuition Lack Inherent Word Order Awareness
No ratings yet
Position Encoding: Intuition Lack Inherent Word Order Awareness
33 pages
Anlp 05 Transformers
No ratings yet
Anlp 05 Transformers
40 pages
Modeling Chatglm
No ratings yet
Modeling Chatglm
20 pages
Astro AI
No ratings yet
Astro AI
20 pages
Coding Attention Mechanisms
No ratings yet
Coding Attention Mechanisms
24 pages
Flux Motor 2018
No ratings yet
Flux Motor 2018
29 pages
1-Tac-12csu Tbfi1 Test Report
No ratings yet
1-Tac-12csu Tbfi1 Test Report
15 pages
Variational AutoEncoders (VAE) With PyTorch - Alexander Van de Kleut
No ratings yet
Variational AutoEncoders (VAE) With PyTorch - Alexander Van de Kleut
17 pages
PyTorch Cheat Sheet & Quick Reference
No ratings yet
PyTorch Cheat Sheet & Quick Reference
6 pages
GPT2 From Scratch in PyTorch
No ratings yet
GPT2 From Scratch in PyTorch
13 pages
AI Lab6
No ratings yet
AI Lab6
22 pages
Prudent Race Engineering OL Brochure - 2nd Edition
No ratings yet
Prudent Race Engineering OL Brochure - 2nd Edition
9 pages
Propeller Shaft
No ratings yet
Propeller Shaft
13 pages
NLP 8
No ratings yet
NLP 8
42 pages
Exp 6,7,8
No ratings yet
Exp 6,7,8
17 pages
Decoder-Only Transformer (LLM) For Question Asking: Notebook Structure
No ratings yet
Decoder-Only Transformer (LLM) For Question Asking: Notebook Structure
9 pages
Correct The Error
No ratings yet
Correct The Error
11 pages
Transformers Implementations 1731410319
No ratings yet
Transformers Implementations 1731410319
10 pages
Final Demonstration LP
No ratings yet
Final Demonstration LP
12 pages
Assignment No 4
No ratings yet
Assignment No 4
8 pages
NLP
No ratings yet
NLP
15 pages
Solved Example of Transformers
No ratings yet
Solved Example of Transformers
20 pages
DAA FinalReport
No ratings yet
DAA FinalReport
14 pages
Transformer Flux
No ratings yet
Transformer Flux
11 pages
Exp 8 Machine Translation
No ratings yet
Exp 8 Machine Translation
11 pages
2344-Article Text-4603-1-10-20230113
No ratings yet
2344-Article Text-4603-1-10-20230113
8 pages
LLM Code Ref
No ratings yet
LLM Code Ref
10 pages
Capacity Design
No ratings yet
Capacity Design
12 pages
Mlp-Fromscratch Sigmoid-Mse
No ratings yet
Mlp-Fromscratch Sigmoid-Mse
13 pages
Transformer
No ratings yet
Transformer
5 pages
TXT
No ratings yet
TXT
7 pages
Pytorch Demo 1749471354
No ratings yet
Pytorch Demo 1749471354
10 pages
Karpathy MinGPT Model
No ratings yet
Karpathy MinGPT Model
7 pages
Transformers
No ratings yet
Transformers
15 pages
Transformer
No ratings yet
Transformer
10 pages
Lab 5
No ratings yet
Lab 5
7 pages
Chapter 6 HW Packet
No ratings yet
Chapter 6 HW Packet
19 pages
A4
No ratings yet
A4
8 pages
Transformer 2
No ratings yet
Transformer 2
6 pages
Code File
No ratings yet
Code File
6 pages
Homework IntroToDL
No ratings yet
Homework IntroToDL
3 pages
Lesson 2
No ratings yet
Lesson 2
8 pages
EncoderDecoderSeq2Seq DeepLSTM
No ratings yet
EncoderDecoderSeq2Seq DeepLSTM
7 pages
Assignment 9
No ratings yet
Assignment 9
4 pages
Course 3 - Week 2 - Exercise - Answer - Ipynb - Colaboratory
No ratings yet
Course 3 - Week 2 - Exercise - Answer - Ipynb - Colaboratory
8 pages
Book Review of Lewis Vaughn's "The Power of Critical Thinking"
No ratings yet
Book Review of Lewis Vaughn's "The Power of Critical Thinking"
6 pages
Word 2 Vec
No ratings yet
Word 2 Vec
3 pages
Nummerical and Simulation Methods For Calculation of Dynamical Transient Characteristics of Squirrel Cage Induction Motor
No ratings yet
Nummerical and Simulation Methods For Calculation of Dynamical Transient Characteristics of Squirrel Cage Induction Motor
4 pages
Nastran Shell Element Orientation Question
No ratings yet
Nastran Shell Element Orientation Question
3 pages
RNN Text Generation
No ratings yet
RNN Text Generation
3 pages
Discriminant Analysis
No ratings yet
Discriminant Analysis
5 pages
GPT 2 - Learninhg 3
No ratings yet
GPT 2 - Learninhg 3
2 pages
Assignment 8
No ratings yet
Assignment 8
2 pages
Acknowledgement Abstract
No ratings yet
Acknowledgement Abstract
6 pages
Siemens 1LA7 Cat 48
No ratings yet
Siemens 1LA7 Cat 48
1 page
Form Substation+400V+Switchboard+Test+Form
No ratings yet
Form Substation+400V+Switchboard+Test+Form
2 pages
Basic Principles and Practices in CC1 1
No ratings yet
Basic Principles and Practices in CC1 1
2 pages
Amc Book 1 2018 Secure
100% (7)
Amc Book 1 2018 Secure
275 pages
Profound Python Data Science
From Everand
Profound Python Data Science
Onder Teker
No ratings yet
Nmrws2: H O That Are Aldehydes
No ratings yet
Nmrws2: H O That Are Aldehydes
4 pages

NLP 4

Uploaded by

NLP 4

Uploaded by

4/18/23, 9:29 PM Transformer_ASSIGN_4_NLP - Jupyter Notebook

def forward(self, x):

self.embed_dim = embed_dim #512 dim

#divising by square root of key dimension

#mutiply with value matrix

scores = torch.matmul(scores, v) ##(32x8x 10x 10) x (32 x 8 x 10 x 64) = (32 x 8

output = self.out(concat) #(32,10,512) -> (32,10,512)

def forward(self, x):

for layer in self.layers:

self.encoder = TransformerEncoder(seq_length, src_vocab_size, embed_dim, num_laye

outputs = self.decoder(trg, enc_out, trg_mask)

# let 0 be sos token and 1 be eos token

torch.Size([2, 12]) torch.Size([2, 12])

out = model(src, target)

torch.Size([2, 12, 11])

You might also like