BERT - Ipynb - Colaboratory
BERT - Ipynb - Colaboratory
ipynb - Colaboratory
# specify GPU
# NOTE(review): device selection is commented out, so no CUDA device is
# configured by this cell.
#device = torch.device("cuda")
# Open the upload widget and keep its return value in `uploaded`; the recorded
# cell output shows a file named spam.csv being saved.
# (`files` is presumably `google.colab.files` — TODO confirm against the imports.)
uploaded = files.upload()
Choose Files No file chosen Upload widget is only available when the cell has been
executed in the current browser session. Please rerun this cell to enable.
Saving spam.csv to spam.csv
0 ham Go until jurong point, crazy.. Available only ... NaN NaN NaN
label message
# Add a numeric target column: 'ham' is encoded as 0 and 'spam' as 1.
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
# Preview the first rows of the frame (notebook cell output).
df.head()
0 0.865937
1 0.134063
Name: label_num, dtype: float64
https://colab.research.google.com/drive/1EOYF-YXlpoImo-EU8X-N1pNuYAtj27Rw#scrollTo=bjCduruPRkq6&printMode=true 1/6
11/22/23, 4:13 PM BERT.ipynb - Colaboratory
# First stage of the split: keep 70% of the messages for training and hold
# out the remaining 30% as `temp_*`, stratified on the numeric label so both
# parts keep the same ham/spam ratio.
# (`temp_*` is presumably divided into validation and test sets in a later
# cell — TODO confirm.)
train_text, temp_text, train_labels, temp_labels = train_test_split(
    df['message'],
    df['label_num'],
    test_size=0.3,
    random_state=2018,
    stratify=df['label_num'],
)
100% 113MB/s]
100% 585B/s]
pd.Series(seq_len).hist(bins = 30)
<Axes: >
https://colab.research.google.com/drive/1EOYF-YXlpoImo-EU8X-N1pNuYAtj27Rw#scrollTo=bjCduruPRkq6&printMode=true 2/6
11/22/23, 4:13 PM BERT.ipynb - Colaboratory
# tokenize and encode sequences in the training set
# Every message is padded or truncated to exactly 25 tokens so the resulting
# 'input_ids' / 'attention_mask' lists are rectangular and can be turned
# into tensors directly.
tokens_train = tokenizer.batch_encode_plus(
    train_text.tolist(),
    max_length=25,
    # `padding='max_length'` is the supported replacement for the deprecated
    # `pad_to_max_length=True`; both pad every sequence up to `max_length`.
    padding='max_length',
    truncation=True,
)
# Convert each split's token ids, attention masks and labels into tensors.
train_seq, train_mask, train_y = (
    torch.tensor(tokens_train['input_ids']),
    torch.tensor(tokens_train['attention_mask']),
    torch.tensor(train_labels.tolist()),
)
val_seq, val_mask, val_y = (
    torch.tensor(tokens_val['input_ids']),
    torch.tensor(tokens_val['attention_mask']),
    torch.tensor(val_labels.tolist()),
)
test_seq, test_mask, test_y = (
    torch.tensor(tokens_test['input_ids']),
    torch.tensor(tokens_test['attention_mask']),
    torch.tensor(test_labels.tolist()),
)

# Bundle the (ids, mask, label) tensors of the training and validation
# splits so that each dataset item is one aligned triple.
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)
https://colab.research.google.com/drive/1EOYF-YXlpoImo-EU8X-N1pNuYAtj27Rw#scrollTo=bjCduruPRkq6&printMode=true 3/6
11/22/23, 4:13 PM BERT.ipynb - Colaboratory
# NOTE(review): this class was flattened by the export. Indentation is gone,
# no `def` headers are visible, and `self.relu`, `self.fc2` and `cls_hs` are
# used below without a visible assignment. Restore the missing lines from the
# original notebook before running.
class BERT_Arch(nn.Module):
# store the `bert` object on the module
self.bert = bert
# dropout layer (drop probability 0.1)
self.dropout = nn.Dropout(0.1)
# dense layer 1: maps 768 input features to 512
self.fc1 = nn.Linear(768,512)
# NOTE(review): the statements from here on read like the forward pass —
# TODO confirm once the method headers are restored.
# pass `cls_hs` through dense layer 1, then ReLU, then dropout
x = self.fc1(cls_hs)
x = self.relu(x)
x = self.dropout(x)
# output layer
x = self.fc2(x)
return x
# Collect the distinct label values present in the training split.
y = train_labels
classes=np.unique(y)
# NOTE(review): `class_weights` is not assigned anywhere in the visible
# lines; the statement that computes it appears to be missing from this
# export — TODO restore it from the notebook.
print('Class Weights:',class_weights)
https://colab.research.google.com/drive/1EOYF-YXlpoImo-EU8X-N1pNuYAtj27Rw#scrollTo=bjCduruPRkq6&printMode=true 4/6
11/22/23, 4:13 PM BERT.ipynb - Colaboratory
# converting list of class weights to a tensor
# (dtype=torch.float makes it a 32-bit floating point tensor)
weights= torch.tensor(class_weights,dtype=torch.float)
# push to GPU
# NOTE(review): disabled, consistent with the `device` definition being
# commented out earlier in the notebook; the weights are not moved to a device.
#weights = weights.to(device)
# NOTE(review): these lines look like fragments of a training routine whose
# `def` line, batch loop and loss/backward steps are not visible in this
# export — TODO restore from the original notebook.
# call the model's train() method (training mode)
model.train()
# running totals, both starting at 0
total_loss, total_accuracy = 0, 0
# update parameters
optimizer.step()
https://colab.research.google.com/drive/1EOYF-YXlpoImo-EU8X-N1pNuYAtj27Rw#scrollTo=bjCduruPRkq6&printMode=true 5/6
11/22/23, 4:13 PM BERT.ipynb - Colaboratory
# function for evaluating the model
# NOTE(review): this cell was flattened and interleaved during export.
#   * Indentation is gone, so the function body is not valid Python as shown.
#   * The `epochs`, `train_losses` and `valid_losses` assignments appear to
#     belong to the epoch-loop cell rather than to this function.
#   * `step`, `sent_id`, `mask` and the initial `total_preds` list are used
#     without a visible assignment.
#   * `total_loss` is never updated in the visible lines, so `avg_loss` as
#     shown would always be 0; `total_accuracy` is never used.
# Restore the missing statements from the original notebook before running.
def evaluate():
print("\nEvaluating...")
# running totals, both starting at 0
total_loss, total_accuracy = 0, 0
# Report progress.
print(' Batch {:>5,} of {:>5,}.'.format(step, len(val_dataloader)))
# deactivate autograd
with torch.no_grad():
# model predictions
preds = model(sent_id, mask)
# collect this batch's predictions
total_preds.append(preds)
#defining epochs
epochs = 20
# compute the validation loss of the epoch
avg_loss = total_loss / len(val_dataloader)
# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]
# reshape the predictions in form of (number of samples, no. of classes)
total_preds = np.concatenate(total_preds, axis=0)
return avg_loss, total_preds
#for each epoch
# Run one training pass followed by one validation pass per epoch.
for epoch in range(epochs):
    # train model; only the first return value is kept, as the epoch's
    # training loss
    train_loss, _ = train()
    # evaluate model; only the first return value is kept, as the epoch's
    # validation loss
    valid_loss, _ = evaluate()
https://colab.research.google.com/drive/1EOYF-YXlpoImo-EU8X-N1pNuYAtj27Rw#scrollTo=bjCduruPRkq6&printMode=true 6/6