The document contains three practical exercises in data processing and machine learning using Kaggle datasets: a linear regression model built in PyTorch to predict Boston housing prices, sentiment analysis of IMDB movie reviews with an LSTM, and image classification on FashionMNIST with a small CNN. Each exercise covers data preparation, model training, and evaluation using Python libraries such as pandas, NumPy, and PyTorch.

practical_1

February 6, 2024

[ ]: # name = "schirmerchad/bostonhoustingmlnd"
# dataset = name.split("/")[1] + ".zip"
# # Mount your Google Drive.
# from google.colab import drive
# drive.mount("/content/drive")

# kaggle_creds_path = "kaggle_token/kaggle.json"

# ! pip install kaggle --quiet

# ! mkdir ~/.kaggle
# ! cp "/content/drive/MyDrive/kaggle_token/kaggle.json" ~/.kaggle/
# ! chmod 600 ~/.kaggle/kaggle.json

# ! kaggle datasets download -d {name}

# ! mkdir kaggle_data
# ! unzip {dataset} -d kaggle_data

# # Unmount your Google Drive


# drive.flush_and_unmount()

Mounted at /content/drive
mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading bostonhoustingmlnd.zip to /content
0% 0.00/4.35k [00:00<?, ?B/s]
100% 4.35k/4.35k [00:00<00:00, 10.8MB/s]
mkdir: cannot create directory ‘kaggle_data’: File exists
Archive: bostonhoustingmlnd.zip
inflating: kaggle_data/housing.csv

[ ]: import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

[ ]: df = pd.read_csv("kaggle_data/housing.csv")

[ ]: device = "cuda" if torch.cuda.is_available() else "cpu"

[ ]: from sklearn.model_selection import train_test_split

[ ]: df

[ ]: RM LSTAT PTRATIO MEDV


0 6.575 4.98 15.3 504000.0
1 6.421 9.14 17.8 453600.0
2 7.185 4.03 17.8 728700.0
3 6.998 2.94 18.7 701400.0
4 7.147 5.33 18.7 760200.0
.. … … … …
484 6.593 9.67 21.0 470400.0
485 6.120 9.08 21.0 432600.0
486 6.976 5.64 21.0 501900.0
487 6.794 6.48 21.0 462000.0
488 6.030 7.88 21.0 249900.0

[489 rows x 4 columns]

[ ]: df.isna().sum()

[ ]: RM 0
LSTAT 0
PTRATIO 0
MEDV 0
dtype: int64

[ ]: from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

[ ]:

[ ]: X = df.drop('MEDV', axis=1)
y = df['MEDV']

X = np.array(X)
y = np.array(y)

scaler = StandardScaler()

X = scaler.fit_transform(X)
y = scaler.fit_transform(y.reshape(-1, 1))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (391, 3)
y_train shape: (391, 1)
X_test shape: (98, 3)
y_test shape: (98, 1)
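Both the features and the target are standardized above (the same scaler object is simply refit on y), so the losses reported later are in standardized MEDV units rather than dollars. A minimal sketch, assuming a separate y_scaler (a name not used above) is kept for the target, of how standardized values can be mapped back to the original price scale:

[ ]: # Sketch (illustrative, not part of the run above): keep a dedicated scaler
# for the target so that standardized predictions can be converted back to dollars.
y_scaler = StandardScaler().fit(np.array(df['MEDV']).reshape(-1, 1))

# example: undo the scaling of the standardized test targets
y_test_dollars = y_scaler.inverse_transform(y_test.reshape(-1, 1))
print(y_test_dollars[:3])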

[ ]:

[ ]: def plot(train_data=X_train, train_labels=y_train, test_data=X_test,
         test_labels=y_test, predictions=None, weight=None, bias=None):

    plt.figure(figsize=(10, 7))

    plt.scatter(train_data, train_labels, c="b", s=4, label="Training data")
    plt.scatter(test_data, test_labels, c="r", s=4, label="Testing Data")

    if predictions is not None:
        plt.scatter(test_data, predictions, c="g", label="Predictions")

        # draw the fitted line y = weight * x + bias (for the first feature)
        x_values = np.linspace(-4, 4, 100)
        y_values = weight * x_values + bias
        plt.plot(x_values, y_values, color='red', label=f'y = {weight}x + {bias}')

    plt.legend(prop={"size": 14})

[ ]: plot(train_data=X_train.T[0], test_data=X_test.T[0])

[ ]: class Model(torch.nn.Module):
def __init__(self):
super().__init__()
self.linear_layer = torch.nn.Linear(in_features=3,out_features=1)

def forward(self,x:torch.Tensor) -> torch.Tensor:


return self.linear_layer(x)

[ ]: torch.manual_seed(42)

[ ]: <torch._C.Generator at 0x7ea8f1b79630>

[ ]: model = Model()

[ ]: loss = torch.nn.L1Loss() # mae

[ ]: opt = torch.optim.SGD(lr=0.01,params=model.parameters())

[ ]: X_train, X_test, y_train, y_test = torch.tensor(X_train, dtype=torch.float), torch.tensor(X_test, dtype=torch.float), torch.tensor(y_train, dtype=torch.float), torch.tensor(y_test, dtype=torch.float)

<ipython-input-113-e7e4131fbf59>:1: UserWarning: To copy construct from a tensor, it is
recommended to use sourceTensor.clone().detach() or
sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
  X_train, X_test, y_train, y_test = torch.tensor(X_train, dtype=torch.float), torch.tensor(X_test, dtype=torch.float), torch.tensor(y_train, dtype=torch.float), torch.tensor(y_test, dtype=torch.float)

[ ]:

[ ]: epochs = 200
for epoch in tqdm(range(epochs)):
model.train()

# fw pass
y_pred = model(X_train)

lossval = loss(y_pred,y_train)

opt.zero_grad()

# back prop

lossval.backward()

# optimizer

opt.step()

# TEST

model.eval()

with torch.inference_mode():
test_pred = model(X_test)
test_loss = loss(test_pred,y_test)

# print

if epoch % 10 == 0:
print(f"epoch {epoch}, loss {lossval}, test_loss {test_loss}")

0%| | 0/200 [00:00<?, ?it/s]


epoch 0, loss 0.8999658823013306, test_loss 0.9410500526428223
epoch 10, loss 0.8269047737121582, test_loss 0.8732254505157471
epoch 20, loss 0.7580215930938721, test_loss 0.8083762526512146
epoch 30, loss 0.6956650018692017, test_loss 0.7480545043945312
epoch 40, loss 0.6434133052825928, test_loss 0.6937804818153381

epoch 50, loss 0.6047266125679016, test_loss 0.6513829231262207
epoch 60, loss 0.5723316073417664, test_loss 0.6149174571037292
epoch 70, loss 0.543216347694397, test_loss 0.58405601978302
epoch 80, loss 0.5205875635147095, test_loss 0.5580319166183472
epoch 90, loss 0.5055828094482422, test_loss 0.5371074676513672
epoch 100, loss 0.4923928380012512, test_loss 0.5179988145828247
epoch 110, loss 0.48150429129600525, test_loss 0.5013106465339661
epoch 120, loss 0.472374826669693, test_loss 0.4893476665019989
epoch 130, loss 0.46420061588287354, test_loss 0.4796214699745178
epoch 140, loss 0.45730704069137573, test_loss 0.4711703956127167
epoch 150, loss 0.450693279504776, test_loss 0.46309569478034973
epoch 160, loss 0.4443809688091278, test_loss 0.4567875862121582
epoch 170, loss 0.4393230974674225, test_loss 0.45154622197151184
epoch 180, loss 0.4351387619972229, test_loss 0.4470271170139313
epoch 190, loss 0.43130671977996826, test_loss 0.4429181218147278

[ ]: y_preds = []
model.eval()
with torch.inference_mode():
test_pred = model(X_test)
y_preds.append(test_pred.numpy())

[ ]:

[ ]: model.state_dict()

[ ]: OrderedDict([('linear_layer.weight', tensor([[ 0.6438, -0.1606, -0.2250]])),
                  ('linear_layer.bias', tensor([0.0866]))])

[ ]: #y = weight * X + bias

[ ]: weight = model.state_dict()["linear_layer.weight"][0][0]

[ ]: bias = model.state_dict()["linear_layer.bias"][0]

[ ]: plot(train_data=X_train.T[0], test_data=X_test.T[0], predictions=y_preds,
          weight=weight.item(), bias=bias.item())
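Since the model is a single linear layer, its predictions can be reproduced by hand from the learned parameters; a small sketch (W, b and manual_pred are illustrative names, not used elsewhere in the notebook):

[ ]: # Sketch: the linear layer computes y = x @ W.T + b, so its output can be
# reproduced directly from the state_dict.
W = model.state_dict()["linear_layer.weight"]   # shape (1, 3)
b = model.state_dict()["linear_layer.bias"]     # shape (1,)

manual_pred = X_test @ W.T + b
with torch.inference_mode():
    model_pred = model(X_test)

print(torch.allclose(manual_pred, model_pred, atol=1e-6))  # expected: True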

[ ]:

[ ]:

[ ]:

practical_2

February 6, 2024

[ ]: name = "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews"
dataset = name.split("/")[1] + ".zip"
# Mount your Google Drive.
from google.colab import drive
drive.mount("/content/drive")

kaggle_creds_path = "kaggle_token/kaggle.json"

! pip install kaggle --quiet

! mkdir ~/.kaggle
! cp "/content/drive/MyDrive/kaggle_token/kaggle.json" ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

! kaggle datasets download -d {name}

! mkdir kaggle_data
! unzip {dataset} -d kaggle_data

# Unmount your Google Drive


drive.flush_and_unmount()

Mounted at /content/drive
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
66% 17.0M/25.7M [00:00<00:00, 69.4MB/s]
100% 25.7M/25.7M [00:00<00:00, 85.5MB/s]
Archive: imdb-dataset-of-50k-movie-reviews.zip
inflating: kaggle_data/IMDB Dataset.csv

[ ]: import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')

nltk.download('wordnet')
nltk.download('punkt')
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data…


[nltk_data] Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data…
[nltk_data] Downloading package punkt to /root/nltk_data…
[nltk_data] Unzipping tokenizers/punkt.zip.

[ ]: import os
from tqdm.auto import tqdm
from collections import Counter

[ ]: df = pd.read_csv("kaggle_data/IMDB Dataset.csv")

[ ]: def transform_label(label):
return 1 if label == 'positive' else 0

x = []
for i in df["sentiment"]:
x.append(transform_label(i))

df["label"] = x
df.head()

[ ]: review sentiment label


0 One of the other reviewers has mentioned that … positive 1
1 A wonderful little production. <br /><br />The… positive 1
2 I thought this was a wonderful way to spend ti… positive 1
3 Basically there's a family where a little boy … negative 0
4 Petter Mattei's "Love in the Time of Money" is… positive 1

[ ]: df['token_length'] = df.review.apply(lambda x: len(x.split()))

[ ]: data_pos = df[df['label'] == 1]
data_pos['token_length'].describe()

[ ]: count 25000.000000
mean 232.849320
std 177.497046
min 10.000000
25% 125.000000
50% 172.000000
75% 284.000000
max 2470.000000
Name: token_length, dtype: float64

[ ]: def process_text(text):
text = word_tokenize(text)
text = [i for i in text if i not in stopwords]
lemmatizer = WordNetLemmatizer()
text = [lemmatizer.lemmatize(t) for t in text]
text = [i for i in text if i not in stopwords]
return ' '.join(text)
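Before cleaning all 50,000 reviews it can help to see what process_text does to a single sentence; a quick illustrative cell (the sample string is made up):

[ ]: # Illustration only: tokenize, drop stopwords, lemmatize one made-up sentence.
sample = "This movie was not as good as I had hoped, but the acting was wonderful!"
print(process_text(sample))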

[ ]: #df['clean'] = df['review'].apply(process_text)
df['processed'] = df['review'].apply(process_text)
df.head()

[ ]: review sentiment label \


0 One of the other reviewers has mentioned that … positive 1
1 A wonderful little production. <br /><br />The… positive 1
2 I thought this was a wonderful way to spend ti… positive 1
3 Basically there's a family where a little boy … negative 0
4 Petter Mattei's "Love in the Time of Money" is… positive 1

token_length processed
0 307 One reviewer mentioned watching 1 Oz episode '…
1 162 A wonderful little production . < br / > < br …
2 166 I thought wonderful way spend time hot summer …
3 138 Basically 's family little boy ( Jake ) think …
4 230 Petter Mattei 's `` Love Time Money '' visuall…

[ ]: reviews = df.processed.values
words = ' '.join(reviews)
words = words.split()
words[:10]

[ ]: ['One',
'reviewer',
'mentioned',
'watching',
'1',
'Oz',
'episode',
"'ll",
'hooked',
'.']

[ ]: counter = Counter(words)
vocab = sorted(counter, key=counter.get, reverse=True)
int2word = dict(enumerate(vocab, 1))
int2word[0] = '<PAD>'
word2int = {word: id for id, word in int2word.items()}
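A small sanity check (illustrative, not part of the original run) that word2int and int2word are inverses and that id 0 is reserved for <PAD>:

[ ]: # Encoding with word2int and decoding with int2word should round-trip exactly.
example = reviews[0].split()[:5]
encoded = [word2int[w] for w in example]
decoded = [int2word[i] for i in encoded]
assert decoded == example
print(encoded, decoded)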

[ ]: reviews_enc = [[word2int[word] for word in review.split()] for review in tqdm(reviews)]

# print the first 5 token ids of the first 5 reviews
for i in range(5):
    print(reviews_enc[i][:5])

0%| | 0/50000 [00:00<?, ?it/s]


[212, 1097, 970, 97, 409]
[66, 341, 74, 268, 2]
[7, 116, 341, 44, 1036]
[2376, 8, 160, 74, 269]
[87832, 11029, 8, 16, 1122]

[ ]: def pad_features(reviews, pad_id, seq_length=128):
    # features = np.zeros((len(reviews), seq_length), dtype=int)
    features = np.full((len(reviews), seq_length), pad_id, dtype=int)

    for i, row in enumerate(reviews):
        # if seq_length < len(row) then the review will be trimmed
        features[i, :len(row)] = np.array(row)[:seq_length]

    return features

seq_length = 256
features = pad_features(reviews_enc, pad_id=word2int['<PAD>'], seq_length=seq_length)

assert len(features) == len(reviews_enc)
assert len(features[0]) == seq_length

features[:10, :10]

[ ]: array([[ 212, 1097, 970, 97, 409, 3810, 204, 161, 3069,
2],
[ 66, 341, 74, 268, 2, 5, 6, 3, 4,
5],
[ 7, 116, 341, 44, 1036, 23, 923, 1794, 2653,
1],
[ 2376, 8, 160, 74, 269, 13, 3233, 12, 50,
8],
[87832, 11029, 8, 16, 1122, 2011, 7681, 14, 2255,
1321],
[ 2788, 3730, 386, 9, 1, 28, 47677, 1, 3106,
7808],
[ 7, 220, 26, 19, 30, 13224, 1925, 65527, 151,
8291],

[ 22, 43, 448, 1, 1367, 107, 4044, 186, 891,
8],
[54585, 1000, 376, 10, 7, 217, 853, 97, 10,
2],
[ 78, 19, 141, 2602, 9168, 2087, 19, 9, 2,
78]])
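The trimming and padding behaviour of pad_features is easiest to see on a toy input; a short sketch with made-up token ids:

[ ]: # Toy example (made-up ids): long rows are trimmed to seq_length,
# short rows are right-padded with the pad id.
toy = [[5, 6, 7, 8, 9, 10], [3, 4]]
print(pad_features(toy, pad_id=0, seq_length=4))
# expected:
# [[5 6 7 8]
#  [3 4 0 0]]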

[ ]: labels = df.label.to_numpy()
labels

[ ]: array([1, 1, 1, …, 0, 0, 0])

[ ]: train_size = .7
val_size = .5

split_id = int(len(features) * train_size)


train_x, remain_x = features[:split_id], features[split_id:]
train_y, remain_y = labels[:split_id], labels[split_id:]

split_val_id = int(len(remain_x) * val_size)


val_x, test_x = remain_x[:split_val_id], remain_x[split_val_id:]
val_y, test_y = remain_y[:split_val_id], remain_y[split_val_id:]

print('Feature Shapes:')
print('===============')
print('Train set: {}'.format(train_x.shape))
print('Validation set: {}'.format(val_x.shape))
print('Test set: {}'.format(test_x.shape))

Feature Shapes:
===============
Train set: (35000, 256)
Validation set: (7500, 256)
Test set: (7500, 256)

[ ]: print(len(train_y[train_y == 0]), len(train_y[train_y == 1]))


print(len(val_y[val_y == 0]), len(val_y[val_y == 1]))
print(len(test_y[test_y == 0]), len(test_y[test_y == 1]))

17510 17490
3753 3747
3737 3763

[ ]: import torch
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
from torch.optim import Adam

[ ]: batch_size = 128

trainset = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))


validset = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
testset = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

trainloader = DataLoader(trainset, shuffle=True, batch_size=batch_size)


valloader = DataLoader(validset, shuffle=True, batch_size=batch_size)
testloader = DataLoader(testset, shuffle=True, batch_size=batch_size)

[ ]: class SentimentModel(nn.Module):
    def __init__(self, vocab_size, output_size, hidden_size=128,
                 embedding_size=400, n_layers=2, dropout=0.2):
        super(SentimentModel, self).__init__()

        # embedding layer maps token ids to dense vector representations
        self.embedding = nn.Embedding(vocab_size, embedding_size)

        # LSTM layer provided by PyTorch
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers,
                            dropout=dropout, batch_first=True)

        # dropout layer
        self.dropout = nn.Dropout(0.3)

        # linear layer for the output
        self.fc = nn.Linear(hidden_size, output_size)

        # sigmoid layer since this is binary classification
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        # convert features to long
        x = x.long()

        # map token ids to vectors
        x = self.embedding(x)

        # pass forward through the lstm
        o, _ = self.lstm(x)

        # take the output of the last time step
        o = o[:, -1, :]

        # apply dropout and the fully connected layer
        o = self.dropout(o)
        o = self.fc(o)

        # sigmoid
        o = self.sigmoid(o)

        return o
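A quick shape check of the architecture with a dummy batch can catch wiring mistakes early; a hedged sketch (tmp_model and dummy are illustrative names, not used elsewhere):

[ ]: # Dummy batch of 4 fake reviews over a tiny vocabulary of 100 ids.
dummy = torch.randint(0, 100, (4, seq_length))
tmp_model = SentimentModel(vocab_size=100, output_size=1)
print(tmp_model(dummy).shape)  # expected: torch.Size([4, 1]) -- one probability per review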

[ ]: device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


print(device)

cuda

[ ]: vocab_size = len(word2int)
output_size = 1
embedding_size = 256
hidden_size = 512
n_layers = 2
dropout = 0.25

model = SentimentModel(vocab_size, output_size, hidden_size, embedding_size,
                       n_layers, dropout)

print(model)

SentimentModel(
(embedding): Embedding(186157, 256)
(lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.25)
(dropout): Dropout(p=0.3, inplace=False)
(fc): Linear(in_features=512, out_features=1, bias=True)
(sigmoid): Sigmoid()
)

[ ]: lr = 0.001
criterion = nn.BCELoss()  # BCELoss because this is a binary classification problem

optim = Adam(model.parameters(), lr=lr)


grad_clip = 5
epochs = 8
print_every = 1
history = {
'train_loss': [],
'train_acc': [],
'val_loss': [],
'val_acc': [],
'epochs': epochs
}
es_limit = 5

[ ]: model = model.to(device)

epochloop = tqdm(range(epochs), position=0, desc='Training', leave=True)

# early stop trigger


es_trigger = 0
val_loss_min = torch.inf

for e in epochloop:
model.train()

train_loss = 0
train_acc = 0

for id, (feature, target) in enumerate(trainloader):


# add epoch meta info
epochloop.set_postfix_str(f'Training batch {id}/{len(trainloader)}')

feature, target = feature.to(device), target.to(device)

optim.zero_grad()

out = model(feature)

predicted = torch.tensor([1 if i == True else 0 for i in out > 0.5], device=device)
equals = predicted == target
acc = torch.mean(equals.type(torch.FloatTensor))
train_acc += acc.item()

loss = criterion(out.squeeze(), target.float())


train_loss += loss.item()
loss.backward()

# clip grad
# nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

optim.step()

del feature, target, predicted

history['train_loss'].append(train_loss / len(trainloader))
history['train_acc'].append(train_acc / len(trainloader))

model.eval()

val_loss = 0
val_acc = 0

with torch.inference_mode():
for id, (feature, target) in enumerate(valloader):
epochloop.set_postfix_str(f'Validation batch {id}/{len(valloader)}')

feature, target = feature.to(device), target.to(device)

out = model(feature)

predicted = torch.tensor([1 if i == True else 0 for i in out > 0.5], device=device)
equals = predicted == target
acc = torch.mean(equals.type(torch.FloatTensor))
val_acc += acc.item()

loss = criterion(out.squeeze(), target.float())


val_loss += loss.item()

del feature, target, predicted

history['val_loss'].append(val_loss / len(valloader))
history['val_acc'].append(val_acc / len(valloader))

# reset model mode


model.train()

# add epoch meta info


epochloop.set_postfix_str(f'Val Loss: {val_loss / len(valloader):.3f} | Val Acc: {val_acc / len(valloader):.3f}')

# print epoch
if (e+1) % print_every == 0:
    epochloop.write(f'Epoch {e+1}/{epochs} | Train Loss: {train_loss / len(trainloader):.3f} Train Acc: {train_acc / len(trainloader):.3f} | Val Loss: {val_loss / len(valloader):.3f} Val Acc: {val_acc / len(valloader):.3f}')

epochloop.update()

# save model if validation loss decrease


if val_loss / len(valloader) <= val_loss_min:
torch.save(model.state_dict(), './sentiment_lstm.pt')
val_loss_min = val_loss / len(valloader)
es_trigger = 0
else:
    epochloop.write(f'[WARNING] Validation loss did not improve ({val_loss_min:.3f} --> {val_loss / len(valloader):.3f})')
    es_trigger += 1

# force early stop


if es_trigger >= es_limit:
epochloop.write(f'Early stopped at Epoch-{e+1}')
# update epochs history
history['epochs'] = e+1
break

Training: 0%| | 0/8 [00:00<?, ?it/s]


Epoch 1/8 | Train Loss: 0.694 Train Acc: 0.504 | Val Loss: 0.692 Val Acc: 0.510
Epoch 2/8 | Train Loss: 0.690 Train Acc: 0.516 | Val Loss: 0.691 Val Acc: 0.500
Epoch 3/8 | Train Loss: 0.693 Train Acc: 0.511 | Val Loss: 0.694 Val Acc: 0.498
[WARNING] Validation loss did not improved (0.691 --> 0.694)
Epoch 4/8 | Train Loss: 0.694 Train Acc: 0.505 | Val Loss: 0.693 Val Acc: 0.508
[WARNING] Validation loss did not improved (0.691 --> 0.693)
Epoch 5/8 | Train Loss: 0.692 Train Acc: 0.508 | Val Loss: 0.695 Val Acc: 0.509
[WARNING] Validation loss did not improved (0.691 --> 0.695)
Epoch 6/8 | Train Loss: 0.682 Train Acc: 0.536 | Val Loss: 0.671 Val Acc: 0.668
Epoch 7/8 | Train Loss: 0.631 Train Acc: 0.640 | Val Loss: 0.612 Val Acc: 0.730
Epoch 8/8 | Train Loss: 0.447 Train Acc: 0.806 | Val Loss: 0.417 Val Acc: 0.821

[ ]: model.eval()

# metrics
test_loss = 0
test_acc = 0

all_target = []
all_predicted = []

testloop = tqdm(testloader, leave=True, desc='Inference')


with torch.no_grad():
for feature, target in testloop:
feature, target = feature.to(device), target.to(device)

out = model(feature)

predicted = torch.tensor([1 if i == True else 0 for i in out > 0.5], device=device)

equals = predicted == target


acc = torch.mean(equals.type(torch.FloatTensor))
test_acc += acc.item()

loss = criterion(out.squeeze(), target.float())

test_loss += loss.item()

all_target.extend(target.cpu().numpy())
all_predicted.extend(predicted.cpu().numpy())

print(f'Accuracy: {test_acc/len(testloader):.4f}, Loss: {test_loss/len(testloader):.4f}')

Inference: 0%| | 0/59 [00:00<?, ?it/s]


Accuracy: 0.8260, Loss: 0.3995
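With the trained weights, scoring a raw review string only needs the same cleaning, encoding and padding steps used above. A minimal sketch (the helper predict_sentiment is illustrative; unknown words are mapped to <PAD> here, something the original encoding never had to handle):

[ ]: # Sketch of single-review inference, reusing process_text, word2int and pad_features.
def predict_sentiment(text):
    tokens = process_text(text).split()
    ids = [word2int.get(w, word2int['<PAD>']) for w in tokens]   # unseen words -> <PAD>
    padded = pad_features([ids], pad_id=word2int['<PAD>'], seq_length=seq_length)
    x = torch.from_numpy(padded).to(device)
    model.eval()
    with torch.inference_mode():
        prob = model(x).item()
    return ('positive' if prob > 0.5 else 'negative'), prob

print(predict_sentiment("A wonderful little production with great acting."))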

[ ]: from sklearn.metrics import classification_report, confusion_matrix


import seaborn as sns

[ ]: print(classification_report(all_predicted, all_target))

              precision    recall  f1-score   support

           0       0.89      0.79      0.84      4191
           1       0.77      0.87      0.82      3309

    accuracy                           0.83      7500
   macro avg       0.83      0.83      0.83      7500
weighted avg       0.83      0.83      0.83      7500

[ ]: cm = confusion_matrix(all_predicted, all_target)
plt.figure(figsize=(5,5))
sns.heatmap(cm, annot=True, fmt='g')
plt.title('Confusion Matrix')
plt.show()

practical_3_fashionMNIST

February 6, 2024

[ ]: import torch
from torch import nn
import torchvision
from torchvision import datasets
from torchvision import transforms
from torchvision.transforms import ToTensor

import matplotlib.pyplot as plt

[ ]: torch.__version__

[ ]: '2.1.0+cu121'

[ ]: torchvision.__version__

[ ]: '0.16.0+cu121'

[ ]: train_data = datasets.FashionMNIST(
root="data",
train=True,
download=True,
transform=ToTensor(),
target_transform=None
)

test_data = datasets.FashionMNIST(
root="data",
train=False,
download=True,
transform=ToTensor(),
target_transform=None
)

[ ]: train_data,test_data

[ ]: (Dataset FashionMNIST
Number of datapoints: 60000
Root location: data
Split: Train
StandardTransform
Transform: ToTensor(),
Dataset FashionMNIST
Number of datapoints: 10000
Root location: data
Split: Test
StandardTransform
Transform: ToTensor())

[ ]: class_names = train_data.classes

[ ]: fig = plt.figure(figsize=(10,10))
rows,cols = 4,4
for i in range(1,rows*cols+1):
random_idx = torch.randint(0,len(train_data),size=[1]).item()
img,label = train_data[random_idx]
fig.add_subplot(rows,cols,i)
plt.imshow(img.squeeze(),cmap="gray")
plt.title(class_names[label])

[ ]: from torch.utils.data import DataLoader

[ ]: BATCH_SIZE = 32
train_dataloader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)

test_dataloader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=False)

train_dataloader,test_dataloader

[ ]: (<torch.utils.data.dataloader.DataLoader at 0x79cace64b460>,
<torch.utils.data.dataloader.DataLoader at 0x79cace64b7c0>)

[ ]: train_features_batch, train_labels_batch = next(iter(train_dataloader))

[ ]: # torch.manual_seed(42)
random_idx = torch.randint(0,len(train_features_batch),size=[1]).item()

img,label = train_features_batch[random_idx],train_labels_batch[random_idx]
plt.imshow(img.squeeze(),cmap="gray")
plt.title(class_names[label])
plt.axis(False)

[ ]: (-0.5, 27.5, 27.5, -0.5)

[ ]: flatten_model = nn.Flatten()

x = train_features_batch[0]

output = flatten_model(x)

[ ]: output.squeeze().shape

[ ]: torch.Size([784])

[ ]: import requests
from pathlib import Path
if Path("helper_functions.py").is_file():
print("Helper Functions exists, no download required")
else:
print()
req = requests.get("https://raw.githubusercontent.com/mrdbourke/
↪pytorch-deep-learning/main/helper_functions.py")

with open("helper_functions.py","wb") as f:
f.write(req.content)

import helper_functions

[ ]: from helper_functions import accuracy_fn

[ ]: from timeit import default_timer as timer

def print_train_time(start:float,end:float,device:torch.device=None):
total = end-start
print(f"Train time on {device}: {total:.3f} secs")

[ ]: from tqdm.auto import tqdm

[ ]: import torch
device = 'cuda' if torch.cuda.is_available() else "cpu"

[ ]: device

[ ]: 'cuda'

[ ]: def train_step(
model:torch.nn.Module,
data_loader:torch.utils.data.DataLoader,
loss_fn:torch.nn.Module,
optimizer:torch.optim.Optimizer,
accuracy_fn,
device:torch.device):

train_loss,train_acc = 0,0

model.train()

for batch,(X,y) in enumerate(data_loader):
X, y = X.to(device),y.to(device)

y_pred = model(X)

loss = loss_fn(y_pred,y)
train_loss += loss
train_acc += accuracy_fn(y_true=y,y_pred=y_pred.argmax(dim=1))

optimizer.zero_grad()

loss.backward()

optimizer.step()

train_loss /= len(data_loader)
train_acc /= len(data_loader)
print(f"Train loss:{train_loss:.5f} | train acc:{train_acc:.2f}%")

def test_step(
model:torch.nn.Module,
data_loader:torch.utils.data.DataLoader,
loss_fn:torch.nn.Module,
accuracy_fn,
device:torch.device
):
## test loop
test_loss,test_acc = 0,0
model.eval()
with torch.inference_mode():
for X,y in data_loader:
X, y = X.to(device),y.to(device)

test_pred = model(X)

test_loss += loss_fn(test_pred,y)

test_acc += accuracy_fn(y_true=y,y_pred=test_pred.argmax(dim=1))

test_loss /= len(test_dataloader)

test_acc /= len(test_dataloader)

print(f"test loss: {test_loss:.5f} test_acc: {test_acc:.2f}")

[ ]: torch.manual_seed(42)
def eval_model(model: nn.Module, data_loader: torch.utils.data.DataLoader,
               loss_fn: torch.nn.Module, accuracy_fn, device):
    """Returns a dict containing the results of the model predicting on data_loader."""
    loss, acc = 0, 0
    model.eval()
    with torch.inference_mode():
        for X, y in tqdm(data_loader):
            X, y = X.to(device), y.to(device)
            y_pred = model(X)

            loss += loss_fn(y_pred, y)
            acc += accuracy_fn(y_true=y, y_pred=y_pred.argmax(dim=1))

        # scale loss and acc to find the average per batch
        loss /= len(data_loader)
        acc /= len(data_loader)

    return {"model": model.__class__.__name__, "model_loss": loss.item(), "model_acc": acc}

[ ]: class FashionMNISTModelV2CNN(nn.Module):
"""
TinyVGG architecture
"""

def __init__(self,input_shape:int,hidden_units:int,output_shape:int):
super().__init__()
self.conv_block_1 = nn.Sequential(
nn.Conv2d(in_channels=input_shape,
out_channels=hidden_units,
kernel_size=3,
stride=1,
padding=1),
nn.ReLU(),
nn.Conv2d(in_channels=hidden_units,
out_channels=hidden_units,
kernel_size=3,
stride=1,
padding=1),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2)

)

self.conv_block_2 = nn.Sequential(
nn.Conv2d(in_channels=hidden_units,
out_channels=hidden_units,
kernel_size=3,
stride=1,
padding=1),
nn.ReLU(),
nn.Conv2d(in_channels=hidden_units,
out_channels=hidden_units,
kernel_size=3,
stride=1,
padding=1),
nn.ReLU(),
nn.MaxPool2d(kernel_size=2)
)

self.classifier = nn.Sequential(
nn.Flatten(),
nn.Linear(in_features=hidden_units*49,out_features=output_shape)
)

def forward(self,x):
x = self.conv_block_1(x)
# print(x.shape)
x = self.conv_block_2(x)
# print(x.shape)
x = self.classifier(x)
return x
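The classifier expects hidden_units*49 inputs because the two 2x2 max-pools reduce the 28x28 images to 7x7 feature maps; a short illustrative check (tmp and dummy_img are made-up names, not part of the training run):

[ ]: # Dummy forward pass to confirm the flatten size used by the classifier.
tmp = FashionMNISTModelV2CNN(input_shape=1, hidden_units=10, output_shape=10)
dummy_img = torch.randn(1, 1, 28, 28)
print(tmp.conv_block_2(tmp.conv_block_1(dummy_img)).shape)  # torch.Size([1, 10, 7, 7])
print(tmp(dummy_img).shape)                                 # torch.Size([1, 10])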

[ ]: from helper_functions import accuracy_fn

[ ]: # train and test

[ ]: torch.manual_seed(42)

model2 = FashionMNISTModelV2CNN(
input_shape=1,
hidden_units=10,
output_shape=len(class_names)
).to(device)

loss_fn = nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(params=model2.parameters(),lr=0.1)

torch.manual_seed(42)
torch.cuda.manual_seed(42)

from timeit import default_timer as timer


train_time_start = timer()

epochs = 3

for epoch in tqdm(range(epochs)):


print(f"\nEpoch: {epoch} \n----------------")

train_step(
model=model2,
data_loader=train_dataloader,
loss_fn=loss_fn,
optimizer=optimizer,
accuracy_fn=accuracy_fn,
device=device
)

test_step(
model=model2,
data_loader=test_dataloader,
loss_fn=loss_fn,
accuracy_fn=accuracy_fn,
device=device
)

train_end_time = timer()

total_time_train = print_train_time(
start = train_time_start,
end = train_end_time,
device=device
)

0%| | 0/3 [00:00<?, ?it/s]

Epoch: 0

----------------
Train loss:0.59443 | train acc:78.32%
test loss: 0.39969 test_acc: 86.04

Epoch: 1
----------------
Train loss:0.35867 | train acc:87.27%
test loss: 0.35115 test_acc: 87.08

Epoch: 2
----------------
Train loss:0.31848 | train acc:88.45%
test loss: 0.31587 test_acc: 88.79
Train time on cuda: 37.646 secs

[ ]: model2_results = eval_model(
model=model2,
data_loader=test_dataloader,
accuracy_fn=accuracy_fn,
loss_fn=loss_fn,
device=device
)

0%| | 0/313 [00:00<?, ?it/s]

[ ]: model2_results

[ ]: {'model': 'FashionMNISTModelV2CNN',
'model_loss': 0.3158661723136902,
'model_acc': 88.7879392971246}

[ ]: def make_predictions(
model,
data,
device,
):
pred_probs = []

model.eval()
with torch.inference_mode():
for sample in data:
sample = torch.unsqueeze(sample,dim=0).to(device)

pred_logit = model(sample)

pred_prob = torch.softmax(pred_logit.squeeze(),dim=0)

pred_probs.append(pred_prob.cpu())

return torch.stack(pred_probs)

[ ]: test_data

[ ]: Dataset FashionMNIST
Number of datapoints: 10000
Root location: data
Split: Test
StandardTransform
Transform: ToTensor()

[ ]:

[ ]: import random
test_samples = []
test_labels = []

for sample,label in random.sample(list(test_data),k=9):


test_samples.append(sample)
test_labels.append(label)

[ ]:

[ ]:

[ ]: pred_probs = make_predictions(
model=model2,
data=test_samples,
device=device
)

[ ]: pred_classes = pred_probs.argmax(dim=1)

[ ]: plt.figure(figsize=(9,9))
rows = 3
cols = 3
for i,sample in enumerate(test_samples):
plt.subplot(rows,cols,i+1)
plt.imshow(sample.squeeze(),cmap="gray")

pred_label = class_names[pred_classes[i]]

truth_label = class_names[test_labels[i]]

title_text = f"Pred: {pred_label} | Truth: {truth_label}"

if pred_label == truth_label:
plt.title(title_text,fontsize=10,c="g")
else:
plt.title(title_text,fontsize=10,c="r")

plt.axis(False)

[ ]: try:
import torchmetrics,mlxtend
print(mlxtend.__version__)
assert int(mlxtend.__version__.split(".")[1]) >= 19, "mlxtend should be greater than 0.19.0"

except:
!pip install torchmetrics mlxtend -U
import torchmetrics,mlxtend

[ ]: mlxtend.__version__

[ ]: '0.23.1'

[ ]: from tqdm.auto import tqdm

[ ]: y_preds = []
model2.eval()
with torch.inference_mode():
for X,y in tqdm(test_dataloader,desc="Make pred"):
X,y = X.to(device),y.to(device)

y_logits = model2(X)

y_pred = torch.softmax(y_logits, dim=1).argmax(dim=1)

y_preds.append(y_pred.cpu())

y_pred_tensor = torch.cat(y_preds)

Make pred: 0%| | 0/313 [00:00<?, ?it/s]

[ ]: y_pred_tensor

[ ]: tensor([9, 2, 1, …, 8, 1, 6])

[ ]: from torchmetrics import ConfusionMatrix


from mlxtend.plotting import plot_confusion_matrix

confmat = ConfusionMatrix(task="multiclass",num_classes=len(class_names))
conf_mat_tensor= confmat(preds=y_pred_tensor,target=test_data.targets)

fig, ax = plot_confusion_matrix(conf_mat=conf_mat_tensor.numpy(), class_names=class_names, figsize=(10,7))

[ ]:

[ ]:
