Transformer Implementation
import math
import torch
import torch.nn as nn
import torch.optim as optim

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0  # d_model must be divisible by num_heads
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # dimension of each attention head

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        # (batch, heads, seq_q, d_k) x (batch, heads, d_k, seq_k) -> (batch, heads, seq_q, seq_k)
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)
        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        # (batch, seq, d_model) -> (batch, heads, seq, d_k)
        batch_size, seq_length, d_model = x.size()
        return x.view(batch_size, seq_length, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        # (batch, heads, seq, d_k) -> (batch, seq, d_model)
        batch_size, _, seq_length, d_k = x.size()
        return x.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))
        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output
The MultiHeadAttention code initializes the module with its input parameters and linear
transformation layers. It calculates attention scores, reshapes the input tensor into multiple
heads, and combines the attention outputs from all heads. The forward method computes the
multi-head self-attention, allowing the model to focus on different aspects of the input
sequence.
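As a quick sanity check (not part of the original notebook), the module can be exercised with random tensors; the batch size, sequence length, d_model, and head count below are illustrative assumptions:

# Hypothetical smoke test: shapes and hyperparameters are illustrative only.
mha = MultiHeadAttention(d_model=512, num_heads=8)
x = torch.randn(2, 5, 512)   # (batch, seq_len, d_model)
out = mha(x, x, x)           # self-attention: Q = K = V = x
print(out.shape)             # expected: torch.Size([2, 5, 512])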
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

The PositionWiseFeedForward class applies two linear transformations with a ReLU activation in between, independently at each position of the sequence.

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)  # sine on even indices
        pe[:, 1::2] = torch.cos(position * div_term)  # cosine on odd indices
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]
The PositionalEncoding class initializes with input parameters d_model and max_seq_length,
creating a tensor to store positional encoding values. The class calculates sine and cosine
values for even and odd indices, respectively, based on the scaling factor div_term. The
forward method computes the positional encoding by adding the stored positional encoding
values to the input tensor, allowing the model to capture the position information of the input
sequence.
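A small illustrative check (values assumed, not from the original notebook) confirms that the encoding depends only on position and embedding dimension, not on the batch:

# Hypothetical check: d_model and sequence lengths are illustrative.
pos_enc = PositionalEncoding(d_model=512, max_seq_length=100)
x = torch.zeros(2, 10, 512)            # zero embeddings make the encoding visible directly
out = pos_enc(x)
print(out.shape)                       # torch.Size([2, 10, 512])
print(torch.allclose(out[0], out[1]))  # True: encoding is identical for every batch element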
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        # Self-attention sub-layer with residual connection and layer norm
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        # Position-wise feed-forward sub-layer with residual connection and layer norm
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x
The EncoderLayer class initializes with its input parameters and components: a
MultiHeadAttention module, a PositionWiseFeedForward module, two layer normalization
modules, and a dropout layer. The forward method computes the encoder layer output by
applying self-attention, adding the attention output to the input tensor, and normalizing the
result. It then computes the position-wise feed-forward output, combines it with the
normalized self-attention output, and normalizes the final result before returning the
processed tensor.
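For illustration (hyperparameters and shapes assumed, not from the original notebook), a single encoder layer preserves the shape of its input:

# Hypothetical usage: hyperparameters are illustrative.
enc_layer = EncoderLayer(d_model=512, num_heads=8, d_ff=2048, dropout=0.1)
x = torch.randn(2, 10, 512)
mask = torch.ones(2, 1, 1, 10)   # no padding positions masked out
print(enc_layer(x, mask).shape)  # torch.Size([2, 10, 512])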
Decoder Layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask, tgt_mask):
        # Masked self-attention over the target sequence
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        # Cross-attention over the encoder output
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        # Position-wise feed-forward sub-layer
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x
The forward method computes the decoder layer output by performing the following steps:
1. Calculate the masked self-attention output and add it to the input tensor, followed by
dropout and layer normalization.
2. Compute the cross-attention output between the decoder and encoder outputs, and add it
to the normalized masked self-attention output, followed by dropout and layer
normalization.
3. Calculate the position-wise feed-forward output and combine it with the normalized
cross-attention output, followed by dropout and layer normalization.
These operations enable the decoder to generate target sequences based on the input and the
encoder output.
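A quick illustrative run (all shapes and hyperparameters are assumptions) shows the decoder layer consuming both the target embeddings and the encoder output:

# Hypothetical usage: shapes and hyperparameters are illustrative.
dec_layer = DecoderLayer(d_model=512, num_heads=8, d_ff=2048, dropout=0.1)
tgt = torch.randn(2, 10, 512)                     # target embeddings
enc_out = torch.randn(2, 12, 512)                 # encoder output (source length 12)
src_mask = torch.ones(2, 1, 1, 12)                # attend to every source position
tgt_mask = torch.tril(torch.ones(1, 1, 10, 10))   # causal mask over the target
print(dec_layer(tgt, enc_out, src_mask, tgt_mask).shape)  # torch.Size([2, 10, 512])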
Now let's combine the encoder and decoder layers to create the complete Transformer model.
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)
        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        # Padding masks: True where the token is not the padding index 0
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        # Causal (no-peek) mask so the decoder cannot attend to future positions
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)
        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output
The Transformer class combines the previously defined modules to create a complete
Transformer model. During initialization, the Transformer module sets up input parameters and
initializes various components, including embedding layers for source and target sequences, a
PositionalEncoding module, EncoderLayer and DecoderLayer modules to create stacked layers,
a linear layer for projecting decoder output, and a dropout layer.
The generate_mask method creates binary masks for source and target sequences to ignore
padding tokens and prevent the decoder from attending to future tokens. The forward method
computes the Transformer model’s output through the following steps:
1. Generate source and target masks using the generate_mask method.
2. Compute source and target embeddings, and apply positional encoding and dropout.
3. Process the source sequence through encoder layers, updating the enc_output tensor.
4. Process the target sequence through decoder layers, using enc_output and masks, and
updating the dec_output tensor.
5. Apply the linear projection layer to the decoder output, obtaining output logits.
These steps enable the Transformer model to process input sequences and generate output
sequences based on the combined functionality of its components.
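The instantiation below relies on hyperparameters and toy data that are not shown in this section. A minimal sketch consistent with the printed shapes (batch size 64, sequence length 100) might look like the following; every value here is an assumption for illustration:

# Assumed hyperparameters and random toy data (illustrative values only).
src_vocab_size = 5000
tgt_vocab_size = 5000
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

# Random token ids in [1, vocab_size); 0 is reserved for padding.
src_data = torch.randint(1, src_vocab_size, (64, max_seq_length))
tgt_data = torch.randint(1, tgt_vocab_size, (64, max_seq_length))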
transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout)
print(src_data.shape)
print(tgt_data.shape)
torch.Size([64, 100])
torch.Size([64, 100])
criterion = nn.CrossEntropyLoss(ignore_index=0)  # ignore the padding index when computing the loss
optimizer = optim.Adam(transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)

transformer.train()
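The section ends before the training loop itself. A minimal sketch of how these pieces are typically wired together follows; the epoch count and the teacher-forcing shift are assumptions, not taken from the original notebook:

# Hypothetical training loop sketch; the number of epochs is an arbitrary choice.
for epoch in range(10):
    optimizer.zero_grad()
    # Teacher forcing: feed the target shifted right, predict the next token.
    output = transformer(src_data, tgt_data[:, :-1])
    loss = criterion(output.contiguous().view(-1, tgt_vocab_size),
                     tgt_data[:, 1:].contiguous().view(-1))
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch + 1}, loss: {loss.item():.4f}")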