0% found this document useful (0 votes)
2 views10 pages

Transformer

This document outlines the installation and implementation of a Transformer model using TensorFlow and Keras, including data preprocessing, tokenization, and the construction of various layers such as Encoder, Decoder, and attention mechanisms. It also includes the definition of a custom learning rate schedule for training the model. The document concludes with the instantiation of the Transformer model and the setup of the optimizer.

Uploaded by

salina karki
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF or read online on Scribd
0% found this document useful (0 votes)
2 views10 pages

Transformer

This document outlines the installation and implementation of a Transformer model using TensorFlow and Keras, including data preprocessing, tokenization, and the construction of various layers such as Encoder, Decoder, and attention mechanisms. It also includes the definition of a custom learning rate schedule for training the model. The document concludes with the instantiation of the Transformer model and the setup of the optimizer.

Uploaded by

salina karki
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF or read online on Scribd
You are on page 1/ 10

#---------------------installation---------
# Notebook shell commands: the leading "!" is notebook (Colab/Jupyter) syntax,
# not Python, so this section only runs inside a notebook cell.

# Install the libcudnn8 package pinned to version 8.1.0.77-1+cuda11.2,
# allowing apt to change held packages.
!apt install --allow-change-held-packages libcudnn8=8.1.0.77-1+cuda11.2


# Remove any preinstalled TensorFlow/Keras packages, then install the
# dependencies used below (protobuf is pinned to a 3.20.x release).
!pip uninstall -y -q tensorflow keras tensorflow-estimator tensorflow-text
!pip install protobuf~=3.20.3
!pip install -q tensorflow_datasets
!pip install -q -U tensorflow-text tensorflow

#-------------------import-----------------
import logging
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds


import tensorflow as tf

import tensorflow_text
import pandas as pd

#--------------- Load the CSV data-----------------
df = pd.read_csv("your_dataset.csv")

# The CSV is assumed to expose a "Message" column and a "Reply" column.
messages, replies = (df[column].tolist() for column in ("Message", "Reply"))

# One Keras tokenizer per side. `filters=''` disables the tokenizer's
# character filtering, so no characters are stripped from the texts.
message_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
reply_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
message_tokenizer.fit_on_texts(messages)
reply_tokenizer.fit_on_texts(replies)

# Turn every text into a list of integer token ids.
tokenized_messages = message_tokenizer.texts_to_sequences(messages)
tokenized_replies = reply_tokenizer.texts_to_sequences(replies)

# Pad at the end ("post") so that each side becomes one rectangular array.
padded_messages = tf.keras.preprocessing.sequence.pad_sequences(
    tokenized_messages, padding='post')
padded_replies = tf.keras.preprocessing.sequence.pad_sequences(
    tokenized_replies, padding='post')
# -----------------------------------------------------------------------------
# Alternative pipeline: tokenize the data using your own tokenizer (assuming
# you already have a tokenizer) instead of the Keras tokenizers.
# NOTE(review): `tokenizers` is not defined anywhere in this file; it must be
# created before this section can run — confirm where it comes from.
# NOTE(review): the `.numpy()` calls assume every cell is a tensor; plain
# Python strings read from a CSV have no such method — confirm the cell type.
tokenized_messages = [tokenizers.en.tokenize(message.numpy())
                      for message in df["Message"]]
tokenized_replies = [tokenizers.en.tokenize(reply.numpy())
                     for reply in df["Reply"]]

dataset = tf.data.Dataset.from_tensor_slices(
    (tokenized_messages, tokenized_replies))

# Pad and batch the dataset
batch_size = 64  # Adjust the batch size as needed
train_batches = dataset.shuffle(len(df)).padded_batch(batch_size)

# Split the dataset into training and validation sets (first 80% / last 20%)
train_size = int(0.8 * len(df))
train_df = df[:train_size]
val_df = df[train_size:]

# Create tf.data.Dataset for training (replaces the unsplit batches above)
train_dataset = tf.data.Dataset.from_tensor_slices(
    (tokenized_messages[:train_size], tokenized_replies[:train_size]))
train_batches = train_dataset.shuffle(train_size).padded_batch(batch_size)

# Create tf.data.Dataset for validation
val_dataset = tf.data.Dataset.from_tensor_slices(
    (tokenized_messages[train_size:], tokenized_replies[train_size:]))
val_batches = val_dataset.padded_batch(batch_size)
# -----------------------------------------------------------------------------
# Pair every padded message with its padded reply.
dataset = tf.data.Dataset.from_tensor_slices((padded_messages, padded_replies))

# Split the dataset into training and validation sets (first 80% / rest).
train_size = int(0.8 * len(padded_messages))
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)


def _to_model_inputs(message, reply):
    """Reshape a (message, reply) batch into ((context, decoder_input), labels).

    Transformer.call unpacks its single input as `context, x`, and Keras `fit`
    treats the first element of each batch as the model input and the second
    as the target. The decoder input is the reply without its last position
    and the labels are the reply shifted left by one, so every position is
    trained to predict the following token.
    """
    return (message, reply[:, :-1]), reply[:, 1:]


# Batch the datasets. drop_remainder=True discards a final partial batch.
batch_size = 64
train_batches = (
    train_dataset.shuffle(train_size)
    .batch(batch_size, drop_remainder=True)
    .map(_to_model_inputs)
)
val_batches = (
    val_dataset.batch(batch_size, drop_remainder=True)
    .map(_to_model_inputs)
)

#-------------------Positional encoding----------------------------
def positional_encoding(length, depth):
    """Build a sinusoidal position table as a float32 tensor.

    Args:
        length: number of positions (rows) to generate.
        depth: encoding width; half of the columns hold sines and the other
            half hold cosines.

    Returns:
        A float32 tensor with one row per position, the sine columns first
        and the cosine columns after them.
    """
    half_depth = depth / 2

    token_positions = np.arange(length)[:, np.newaxis]                    # (seq, 1)
    scaled_depths = np.arange(half_depth)[np.newaxis, :] / half_depth     # (1, depth)

    # One frequency per column, falling from 1 towards 1/10000.
    frequencies = 1 / (10000 ** scaled_depths)                            # (1, depth)
    angles = token_positions * frequencies                                # (pos, depth)

    table = np.concatenate([np.sin(angles), np.cos(angles)], axis=-1)
    return tf.cast(table, dtype=tf.float32)

#---check of positional encoding


#@title
# Build a 2048-position, depth-512 encoding table for inspection.
pos_encoding = positional_encoding(length=2048, depth=512)

# Check the shape.


print(pos_encoding.shape)

# Plot the dimensions.


# Transposed so that position runs along the x-axis and depth along the y-axis.
plt.pcolormesh(pos_encoding.numpy().T, cmap='RdBu')
plt.ylabel('Depth')
plt.xlabel('Position')
plt.colorbar()
plt.show()

#@title
# Divide each position's vector by its norm along axis 1, then take the dot
# product of every position's vector with the vector at position 1000.
pos_encoding/=tf.norm(pos_encoding, axis=1, keepdims=True)
p = pos_encoding[1000]
dots = tf.einsum('pd,d -> p', pos_encoding, p)
# Top panel: the dot products over all positions; the black vertical segments
# at x=950 and x=1050 mark the range shown in the bottom panel.
plt.subplot(2,1,1)
plt.plot(dots)
plt.ylim([0,1])
plt.plot([950, 950, float('nan'), 1050, 1050],
[0,1,float('nan'),0,1], color='k', label='Zoom')
plt.legend()
# Bottom panel: the same curve restricted to positions 950-1050.
plt.subplot(2,1,2)
plt.plot(dots)
plt.xlim([950, 1050])
plt.ylim([0,1])

#----------------------PositionalEmbedding--------
class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding with the sinusoidal position table added on top.

    Args:
        vocab_size: number of rows in the embedding table.
        d_model: embedding width, also used as the positional-encoding depth.
    """

    def __init__(self, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        # mask_zero=True makes token id 0 act as padding for downstream layers.
        self.embedding = tf.keras.layers.Embedding(
            vocab_size, d_model, mask_zero=True)
        # The table is precomputed for 2048 positions and sliced in call().
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)

    def compute_mask(self, *args, **kwargs):
        """Delegate mask computation to the wrapped embedding layer."""
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self, x):
        """Embed token ids `x` and add the position table for its length."""
        seq_len = tf.shape(x)[1]
        embedded = self.embedding(x)
        # This factor sets the relative scale of the embedding and the
        # positional encoding.
        embedded *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        return embedded + self.pos_encoding[tf.newaxis, :seq_len, :]

# Sample batch for the smoke tests in this file: the first (up to) 64 padded
# message rows as `pt` and the matching padded reply rows as `en`.
pt = tf.constant(padded_messages[:64])
en = tf.constant(padded_replies[:64])

# Vocabulary sizes come from the Keras tokenizers fitted on the CSV columns.
# `word_index` ids start at 1 and id 0 is left for padding, hence the `+ 1`.
embed_pt = PositionalEmbedding(
    vocab_size=len(message_tokenizer.word_index) + 1,
    d_model=512)
embed_en = PositionalEmbedding(
    vocab_size=len(reply_tokenizer.word_index) + 1,
    d_model=512)

pt_emb = embed_pt(pt)
en_emb = embed_en(en)
# Notebook-style inspection of the mask attached to the embedded replies.
en_emb._keras_mask

#----------------------Multi-head attention & layer normalization------

class BaseAttention(tf.keras.layers.Layer):
    """Shared building blocks for the attention layers in this file.

    Holds a MultiHeadAttention layer, a LayerNormalization layer and an Add
    layer; subclasses supply the `call` method that wires them together.

    Args:
        **kwargs: forwarded unchanged to tf.keras.layers.MultiHeadAttention.
    """

    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()
#---------------------CrossAttention layer of the decoder, where inputs also come from the encoder----------------
class CrossAttention(BaseAttention):
    """Attention in which `x` provides the queries and `context` the keys/values."""

    def call(self, x, context):
        """Attend from `x` to `context`, then apply the residual add and layer norm."""
        attended, scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True)

        # Cache the attention scores for plotting later.
        self.last_attn_scores = scores

        return self.layernorm(self.add([x, attended]))

#---test-Run
# Smoke test: build a 2-head cross-attention layer and print the shapes of
# both inputs and of the output. `en_emb` is passed as the query (`x`) and
# `pt_emb` as the context.
sample_ca = CrossAttention(num_heads=2, key_dim=512)

print(pt_emb.shape)
print(en_emb.shape)
print(sample_ca(en_emb, pt_emb).shape)

#--------------------GlobalSelfAttention of the first part of the encoder---


class GlobalSelfAttention(BaseAttention):
    """Self-attention in which `x` serves as query, key and value; no causal mask is requested."""

    def call(self, x):
        """Self-attend over `x`, then apply the residual add and layer norm."""
        attended = self.mha(query=x, value=x, key=x)
        return self.layernorm(self.add([x, attended]))
#----test
# Smoke test: run a 2-head global self-attention layer on `pt_emb` and print
# the input and output shapes.
sample_gsa = GlobalSelfAttention(num_heads=2, key_dim=512)

print(pt_emb.shape)
print(sample_gsa(pt_emb).shape)

#--------------CausalSelfAttention---------
class CausalSelfAttention(BaseAttention):
    """Self-attention over `x` with the attention layer's causal mask enabled."""

    def call(self, x):
        """Causally self-attend over `x`, then apply the residual add and layer norm."""
        attended = self.mha(
            query=x,
            value=x,
            key=x,
            use_causal_mask=True)
        return self.layernorm(self.add([x, attended]))
#---test
# Smoke test: run a 2-head causal self-attention layer on `en_emb` and print
# the input and output shapes.
sample_csa = CausalSelfAttention(num_heads=2, key_dim=512)

print(en_emb.shape)
print(sample_csa(en_emb).shape)
# Causality check: `out1` runs the layer on only the first three positions,
# `out2` runs it on the full sequence and keeps the first three outputs.
out1 = sample_csa(embed_en(en[:, :3]))
out2 = sample_csa(embed_en(en))[:, :3]

# Largest absolute difference between the two results.
tf.reduce_max(abs(out1 - out2)).numpy()

#--------------------FeedForward network----------
class FeedForward(tf.keras.layers.Layer):
    """Two dense layers plus dropout, wrapped in a residual add and layer norm.

    Args:
        d_model: output width of the second dense layer.
        dff: width of the first (ReLU) dense layer.
        dropout_rate: rate of the dropout applied after the second dense layer.
    """

    def __init__(self, d_model, dff, dropout_rate=0.1):
        super().__init__()
        expand = tf.keras.layers.Dense(dff, activation='relu')
        project = tf.keras.layers.Dense(d_model)
        drop = tf.keras.layers.Dropout(dropout_rate)
        self.seq = tf.keras.Sequential([expand, project, drop])
        self.add = tf.keras.layers.Add()
        self.layer_norm = tf.keras.layers.LayerNormalization()

    def call(self, x):
        """Add the network's output to `x` and layer-normalize the sum."""
        residual_sum = self.add([x, self.seq(x)])
        return self.layer_norm(residual_sum)
#---test the layer
# Smoke test: run a feed-forward block (d_model=512, dff=2048) on `en_emb` and
# print the input and output shapes.
sample_ffn = FeedForward(512, 2048)

print(en_emb.shape)
print(sample_ffn(en_emb).shape)

#-----------------Encoder layer-------
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: global self-attention followed by the feed-forward network.

    Args:
        d_model: passed as `key_dim` to the attention layer and as the
            feed-forward output width.
        num_heads: number of attention heads.
        dff: width of the feed-forward hidden layer.
        dropout_rate: dropout rate used by the attention layer and by the
            feed-forward network.
    """

    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()

        self.self_attention = GlobalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        # Forward dropout_rate so the feed-forward block honours the configured
        # rate instead of always falling back to its own default.
        self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

    def call(self, x):
        """Apply self-attention and then the feed-forward network to `x`."""
        x = self.self_attention(x)
        return self.ffn(x)

#---test
# Smoke test: run one encoder layer on `pt_emb` and print the input and output
# shapes.
sample_encoder_layer = EncoderLayer(d_model=512, num_heads=8, dff=2048)

print(pt_emb.shape)
print(sample_encoder_layer(pt_emb).shape)

#---------------Encoder----------------------
class Encoder(tf.keras.layers.Layer):
    """Positional embedding, dropout, then a stack of `num_layers` encoder layers.

    Args:
        num_layers: number of EncoderLayer instances in the stack.
        d_model: embedding width, forwarded to every sub-layer.
        num_heads: attention heads per encoder layer.
        dff: feed-forward hidden width per encoder layer.
        vocab_size: vocabulary size of the embedding.
        dropout_rate: rate for the dropout after the embedding; also forwarded
            to each encoder layer.
    """

    def __init__(self, *, num_layers, d_model, num_heads,
                 dff, vocab_size, dropout_rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(
            vocab_size=vocab_size, d_model=d_model)

        layer_kwargs = dict(d_model=d_model, num_heads=num_heads,
                            dff=dff, dropout_rate=dropout_rate)
        self.enc_layers = [EncoderLayer(**layer_kwargs)
                           for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x):
        """Encode token ids `x` of shape (batch, seq_len)."""
        # Embed and add positions: (batch_size, seq_len, d_model).
        hidden = self.dropout(self.pos_embedding(x))

        for layer in self.enc_layers:
            hidden = layer(hidden)

        return hidden  # Shape `(batch_size, seq_len, d_model)`.


#---Instantiate the encoder.
# NOTE(review): vocab_size=8500 is hard-coded here — confirm it covers every
# token id that appears in `pt`.
sample_encoder = Encoder(num_layers=4,
d_model=512,
num_heads=8,
dff=2048,
vocab_size=8500)

# Run the encoder on the token ids in `pt` with training=False.
sample_encoder_output = sample_encoder(pt, training=False)

#---Print the shape.


print(pt.shape)
print(sample_encoder_output.shape) # Shape `(batch_size, input_seq_len, d_model)`.

#------------------------Decoder layer---------------------
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: causal self-attention, cross-attention, feed-forward.

    Args:
        d_model: passed as `key_dim` to both attention layers and as the
            feed-forward output width.
        num_heads: number of attention heads.
        dff: width of the feed-forward hidden layer.
        dropout_rate: dropout rate used by both attention layers and by the
            feed-forward network.
    """

    def __init__(self, *, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()

        self.causal_self_attention = CausalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        self.cross_attention = CrossAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        # Forward dropout_rate so the feed-forward block honours the configured
        # rate instead of always falling back to its own default.
        self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

    def call(self, x, context):
        """Decode `x` while attending to the encoder output `context`."""
        x = self.causal_self_attention(x=x)
        x = self.cross_attention(x=x, context=context)

        # Cache the last attention scores for plotting later.
        self.last_attn_scores = self.cross_attention.last_attn_scores

        return self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.

#----test
# Smoke test: run one decoder layer with `en_emb` as the decoder input and
# `pt_emb` as the context, then print the shapes.
sample_decoder_layer = DecoderLayer(d_model=512, num_heads=8, dff=2048)

sample_decoder_layer_output = sample_decoder_layer(
x=en_emb, context=pt_emb)

print(en_emb.shape)
print(pt_emb.shape)
print(sample_decoder_layer_output.shape) # `(batch_size, seq_len, d_model)`

#-------------------------Decoder-----------
class Decoder(tf.keras.layers.Layer):
    """Positional embedding, dropout, then a stack of `num_layers` decoder layers.

    Args:
        num_layers: number of DecoderLayer instances in the stack.
        d_model: embedding width, forwarded to every sub-layer.
        num_heads: attention heads per decoder layer.
        dff: feed-forward hidden width per decoder layer.
        vocab_size: vocabulary size of the embedding.
        dropout_rate: rate for the dropout after the embedding; also forwarded
            to each decoder layer.
    """

    def __init__(self, *, num_layers, d_model, num_heads, dff, vocab_size,
                 dropout_rate=0.1):
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size,
                                                 d_model=d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

        layer_kwargs = dict(d_model=d_model, num_heads=num_heads,
                            dff=dff, dropout_rate=dropout_rate)
        self.dec_layers = [DecoderLayer(**layer_kwargs)
                           for _ in range(num_layers)]

        # Filled in by call() with the last layer's cross-attention scores.
        self.last_attn_scores = None

    def call(self, x, context):
        """Decode token ids `x` of shape (batch, target_seq_len) against `context`."""
        # Embed and add positions: (batch_size, target_seq_len, d_model).
        hidden = self.dropout(self.pos_embedding(x))

        for layer in self.dec_layers:
            hidden = layer(hidden, context)

        self.last_attn_scores = self.dec_layers[-1].last_attn_scores

        # The shape of the result is (batch_size, target_seq_len, d_model).
        return hidden

# Instantiate the decoder.
# NOTE(review): vocab_size=8000 is hard-coded here — confirm it covers every
# token id that appears in `en`.


sample_decoder = Decoder(num_layers=4,
d_model=512,
num_heads=8,
dff=2048,
vocab_size=8000)

# Run the decoder on the token ids in `en`, using `pt_emb` as the context.
output = sample_decoder(
x=en,
context=pt_emb)

# Print the shapes.


print(en.shape)
print(pt_emb.shape)
print(output.shape)

# Notebook-style inspection of the cached cross-attention scores.
sample_decoder.last_attn_scores.shape # (batch, heads, target_seq, input_seq)

#-------------------------Transformer-------------------
class Transformer(tf.keras.Model):
    """Encoder-decoder model with a final dense layer producing vocabulary logits.

    Args:
        num_layers: number of layers in both the encoder and the decoder.
        d_model: model width shared by the encoder and decoder.
        num_heads: attention heads per layer.
        dff: feed-forward hidden width per layer.
        input_vocab_size: vocabulary size for the encoder embedding.
        target_vocab_size: vocabulary size for the decoder embedding and the
            width of the output logits.
        dropout_rate: dropout rate forwarded to the encoder and decoder.
    """

    def __init__(self, *, num_layers, d_model, num_heads, dff,
                 input_vocab_size, target_vocab_size, dropout_rate=0.1):
        super().__init__()
        shared_kwargs = dict(num_layers=num_layers, d_model=d_model,
                             num_heads=num_heads, dff=dff,
                             dropout_rate=dropout_rate)

        self.encoder = Encoder(vocab_size=input_vocab_size, **shared_kwargs)
        self.decoder = Decoder(vocab_size=target_vocab_size, **shared_kwargs)
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inputs):
        """Return logits for `inputs`, a `(context, x)` pair of token-id tensors."""
        # To use a Keras model with `.fit` you must pass all your inputs in
        # the first argument, so both tensors arrive packed together.
        context, x = inputs

        encoded = self.encoder(context)      # (batch_size, context_len, d_model)
        decoded = self.decoder(x, encoded)   # (batch_size, target_len, d_model)

        # Final linear layer: (batch_size, target_len, target_vocab_size).
        logits = self.final_layer(decoded)

        try:
            # Drop the keras mask, so it doesn't scale the losses/metrics.
            # b/250038731
            del logits._keras_mask
        except AttributeError:
            pass

        return logits

#------------Hyperparameters------------
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

# Vocabulary sizes come from the Keras tokenizers fitted on the CSV columns.
# `word_index` ids start at 1 and id 0 is left for padding, hence the `+ 1`.
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=len(message_tokenizer.word_index) + 1,
    target_vocab_size=len(reply_tokenizer.word_index) + 1,
    dropout_rate=dropout_rate)

#---test
# `pt` and `en` are the sample token-id batches used by the earlier smoke tests.
output = transformer((pt, en))

print(en.shape)
print(pt.shape)
print(output.shape)

attn_scores = transformer.decoder.dec_layers[-1].last_attn_scores
print(attn_scores.shape)  # (batch, heads, target_seq, input_seq)

transformer.summary()

#--------------------Set up the optimizer----------
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule: rsqrt(d_model) * min(rsqrt(step), step * warmup_steps**-1.5).

    Args:
        d_model: model width; stored as a float32 tensor.
        warmup_steps: step count used in the `step * warmup_steps**-1.5` term.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        """Return the learning rate for `step`."""
        step = tf.cast(step, dtype=tf.float32)
        decay_term = tf.math.rsqrt(step)
        warmup_term = step * (self.warmup_steps ** -1.5)
        # The smaller of the two terms is used at every step.
        smaller_term = tf.math.minimum(decay_term, warmup_term)
        return tf.math.rsqrt(self.d_model) * smaller_term

#------instantiate the optimizer-----


# Learning-rate schedule built from the model width `d_model`.
learning_rate = CustomSchedule(d_model)

# Adam driven by the schedule above, with explicit beta_1, beta_2 and epsilon.
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,


epsilon=1e-9)

# Plot the schedule's value for steps 0..39999.
plt.plot(learning_rate(tf.range(40000, dtype=tf.float32)))
plt.ylabel('Learning Rate')
plt.xlabel('Train Step')
#--------------------Set up the loss and metrics----
def masked_loss(label, pred):
    """Sparse categorical cross-entropy averaged over positions whose label is not 0.

    Args:
        label: integer token ids; positions equal to 0 are excluded.
        pred: logits for each position.

    Returns:
        Sum of the per-position losses at non-zero labels divided by the
        number of non-zero labels.
    """
    keep = label != 0
    per_token = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')(label, pred)

    keep = tf.cast(keep, dtype=per_token.dtype)
    per_token *= keep

    return tf.reduce_sum(per_token) / tf.reduce_sum(keep)

def masked_accuracy(label, pred):
    """Fraction of non-zero-label positions where the argmax of `pred` equals the label.

    Args:
        label: integer token ids; positions equal to 0 are excluded.
        pred: logits for each position; the argmax is taken over axis 2.

    Returns:
        Number of correct non-zero-label positions divided by the number of
        non-zero-label positions.
    """
    predicted_ids = tf.argmax(pred, axis=2)
    label = tf.cast(label, predicted_ids.dtype)

    is_token = label != 0
    hits = (label == predicted_ids) & is_token

    hit_count = tf.reduce_sum(tf.cast(hits, dtype=tf.float32))
    token_count = tf.reduce_sum(tf.cast(is_token, dtype=tf.float32))
    return hit_count / token_count

#----------------Train the model----------
# Compile with the padding-aware loss and accuracy defined above.
transformer.compile(
    loss=masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

# Transformer.call unpacks its input as `(context, x)`, so every batch is
# expected to supply that pair as the model input.
transformer.fit(train_batches,
                epochs=20,
                validation_data=val_batches,
                verbose=1)

You might also like