DSC 253 Homework 1
import pandas as pd
from sklearn.model_selection import train_test_split
print(nyt.head())
print(ag.head())
text label
0 (reuters) - carlos tevez sealed his move to ju... sports
1 if professional pride and strong defiance can ... sports
2 palermo, sicily — roberta vinci beat top-seede... sports
3 spain's big two soccer teams face a pair of it... sports
4 the argentine soccer club san lorenzo complete... sports
text
0 wall st. bears claw back into the black (reute...
1 carlyle looks toward commercial aerospace (reu...
2 oil and economy cloud stocks' outlook (reuters...
3 iraq halts oil exports from main southern pipe...
4 oil prices soar to all-time record, posing new...
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    return text.split()
# Apply preprocessing
nyt['processed_text'] = nyt['text'].apply(preprocess_text)
ag['processed_text'] = ag['text'].apply(preprocess_text)
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
micro_f1 = f1_score(y_test, y_pred, average='micro')
# Binary bag-of-words vectorizer (1 if the word occurs in a document, 0 otherwise)
binary_vectorizer = CountVectorizer(binary=True)
X_train_binary = binary_vectorizer.fit_transform(train_data['text'])
X_val_binary = binary_vectorizer.transform(val_data['text'])
X_test_binary = binary_vectorizer.transform(test_data['text'])
import pandas as pd
import gensim
from gensim.models import Word2Vec
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import re
# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    return text.split()
# Apply preprocessing
nyt['processed_text'] = nyt['text'].apply(preprocess_text)
ag['processed_text'] = ag['text'].apply(preprocess_text)
text label \
0 (reuters) - carlos tevez sealed his move to ju... sports
1 if professional pride and strong defiance can ... sports
2 palermo, sicily — roberta vinci beat top-seede... sports
3 spain's big two soccer teams face a pair of it... sports
4 the argentine soccer club san lorenzo complete... sports
processed_text
0 [reuters, carlos, tevez, sealed, his, move, to...
1 [if, professional, pride, and, strong, defianc...
2 [palermo, sicily, roberta, vinci, beat, top, s...
3 [spain, s, big, two, soccer, teams, face, a, p...
4 [the, argentine, soccer, club, san, lorenzo, c...
text \
0 wall st. bears claw back into the black (reute...
1 carlyle looks toward commercial aerospace (reu...
2 oil and economy cloud stocks' outlook (reuters...
3 iraq halts oil exports from main southern pipe...
4 oil prices soar to all-time record, posing new...
processed_text
0 [wall, st, bears, claw, back, into, the, black...
1 [carlyle, looks, toward, commercial, aerospace...
2 [oil, and, economy, cloud, stocks, outlook, re...
3 [iraq, halts, oil, exports, from, main, southe...
4 [oil, prices, soar, to, all, time, record, pos...
import numpy as np
def load_glove_model(File):
    print("Loading Glove Model")
    glove_model = {}
    with open(File, 'r') as f:
        for line in f:
            split_line = line.split()
            word = split_line[0]
            embedding = np.array(split_line[1:], dtype=np.float64)
            glove_model[word] = embedding
    print(f"{len(glove_model)} words loaded!")
    return glove_model
# Glove - 2 (a)
y_train = train_data['label']
y_val = val_data['label']
y_test = test_data['label']
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
micro_f1 = f1_score(y_test, y_pred, average='micro')
# Word2Vec AG -> NYT - 2 (b)
# Create document vectors for NYT dataset using AG News Word2Vec model
nyt['w2v_ag_vector'] = nyt['processed_text'].apply(get_document_vector_w2v_ag)
X_train_w2v_ag = np.vstack(train_data['w2v_ag_vector'])
X_val_w2v_ag = np.vstack(val_data['w2v_ag_vector'])
X_test_w2v_ag = np.vstack(test_data['w2v_ag_vector'])
y_train = train_data['label']
y_val = val_data['label']
y_test = test_data['label']
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
micro_f1 = f1_score(y_test, y_pred, average='micro')
# Word2Vec - 2 (c)
# Create document vectors for NYT dataset using NYT Word2Vec model
nyt['w2v_nyt_vector'] = nyt['processed_text'].apply(get_document_vector_w2v_nyt)
X_train_w2v_nyt = np.vstack(train_data['w2v_nyt_vector'])
X_val_w2v_nyt = np.vstack(val_data['w2v_nyt_vector'])
X_test_w2v_nyt = np.vstack(test_data['w2v_nyt_vector'])
y_train = train_data['label']
y_val = val_data['label']
y_test = test_data['label']
# Calculate metrics
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
micro_f1 = f1_score(y_test, y_pred, average='micro')
# BERT - 3 (a)
# Importing libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
# Setting up GPU
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)
cuda
df = pd.read_csv('nyt.csv')
df['list'] = df[df.columns[2:]].values.tolist()
new_df = df[['text', 'label']].copy()
new_df.head()
new_df['label'].value_counts()
count
label
sports 8639
politics 1451
business 1429
dtype: int64
mapping = {
    'sports': [1, 0, 0],
    'politics': [0, 1, 0],
    'business': [0, 0, 1]
}
new_df['label'] = df['label'].map(mapping)
new_df.head()
# config
MAX_LEN = 64
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = dataframe.text
        self.targets = dataframe.label
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        # Tokenize, add [CLS]/[SEP], and pad to max_len
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }
train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=42)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
print(f"Dataset: {new_df.shape}")
print(f"Train set: {train_dataset.shape}")
print(f"Test set: {test_dataset.shape}")
Dataset: (11519, 2)
Train set: (9215, 2)
Test set: (2304, 2)
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 3)

    def forward(self, ids, mask, token_type_ids):
        # Pooled [CLS] representation -> dropout -> 3-way linear classifier
        _, pooled = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        return self.l3(self.l2(pooled))
model = BERTClass()
model.to(device)
def train(epoch):
    model.train()
    for _, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)
        # Forward pass and loss
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        if _ % 5000 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')
        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
Epoch: 0, Loss: 0.6873428225517273
Epoch: 1, Loss: 0.014567049220204353
Epoch: 2, Loss: 0.003150239121168852
def validation(epoch):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets
DSC 253 - Homework 1
Bag of Words Model:
Dataset Preparation
Preprocessing
For the binary BoW representation, I used CountVectorizer with the binary=True parameter. This
configuration creates a binary vector where:
● The value is set to 1 if a word is present in the document, regardless of its frequency.
● The value is 0 if the word is not present.
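A minimal sketch of this setup, assuming the train_data/test_data splits and the LogisticRegression classifier used elsewhere in the notebook (the evaluate_vectorizer helper name is illustrative, not part of the original code):

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

def evaluate_vectorizer(vectorizer, train_data, test_data):
    # Fit the vectorizer on the training text only, then transform both splits
    X_train = vectorizer.fit_transform(train_data['text'])
    X_test = vectorizer.transform(test_data['text'])
    # Train a logistic regression classifier on the vectorized documents
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, train_data['label'])
    # Report accuracy, macro-F1, and micro-F1 on the test split
    y_pred = clf.predict(X_test)
    return (accuracy_score(test_data['label'], y_pred),
            f1_score(test_data['label'], y_pred, average='macro'),
            f1_score(test_data['label'], y_pred, average='micro'))

# Binary bag-of-words: 1 if a word occurs in the document, 0 otherwise
binary_metrics = evaluate_vectorizer(CountVectorizer(binary=True), train_data, test_data)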
Model Performance
The model achieved the following performance metrics on the test set:
● Accuracy: 97.83%
● Macro-F1 Score: 94.02%
● Micro-F1 Score: 97.83%
Frequency Bag of Words Model:
For the frequency BoW representation, I used CountVectorizer without setting the binary
parameter. This configuration creates a frequency vector where the value represents the
number of times a word appears in the document.
The binary model's high accuracy and F1 scores indicate that simply capturing the presence or
absence of words is already largely sufficient for this classification task; the frequency
representation tests whether raw counts add further signal.
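With the evaluate_vectorizer helper sketched above, the frequency variant only swaps the vectorizer:

# Frequency bag-of-words: raw term counts instead of 0/1 indicators
count_metrics = evaluate_vectorizer(CountVectorizer(), train_data, test_data)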
Model Performance
The model achieved the following performance metrics on the test set:
● Accuracy: 98.26%
● Macro-F1 Score: 95.36%
● Micro-F1 Score: 98.26%
TF-IDF Model:
For the TF-IDF representation, I used TfidfVectorizer. This configuration creates a vector where
the value represents the TF-IDF score of each word in the document, reflecting both its
frequency in the document and its importance across the entire corpus.
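Again reusing the evaluate_vectorizer sketch from above, only the vectorizer changes:

# TF-IDF: term frequency re-weighted by inverse document frequency
tfidf_metrics = evaluate_vectorizer(TfidfVectorizer(), train_data, test_data)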
Model Performance
The model achieved the following performance metrics on the test set:
● Accuracy: 97.83%
● Macro-F1 Score: 94.41%
● Micro-F1 Score: 97.83%
The high accuracy and F1 scores indicate that the TF-IDF model effectively weights word
occurrences by their informativeness, which supports strong classification performance on this task.
GloVe Model:
I loaded pre-trained GloVe vectors (100-dimensional) to create word embeddings. For each
document, I calculated the average of its word vectors to obtain a document vector. If no word
from the document was found in the GloVe model, a zero vector was assigned.
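A minimal sketch of the averaging step, assuming glove_model is the word-to-vector dictionary returned by load_glove_model in the notebook (the function name and column name below are illustrative):

import numpy as np

def get_document_vector_glove(tokens, glove_model, dim=100):
    # Average the GloVe vectors of all tokens found in the vocabulary
    vectors = [glove_model[word] for word in tokens if word in glove_model]
    if not vectors:
        # No token found in GloVe: fall back to a zero vector
        return np.zeros(dim)
    return np.mean(vectors, axis=0)

# Example usage on the preprocessed NYT text
nyt['glove_vector'] = nyt['processed_text'].apply(
    lambda toks: get_document_vector_glove(toks, glove_model))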
Model Evaluation
A Logistic Regression model was trained using the GloVe-based document vectors:
● Training: The model was trained with the training set (80% of the data).
● Validation: Performance was monitored using the validation set (10% of the data) to
fine-tune model parameters.
● Testing: The final performance was evaluated using the test set (10% of the data).
Model Performance
The model achieved the following performance metrics on the test set:
● Accuracy: 97.48%
● Macro-F1 Score: 93.51%
● Micro-F1 Score: 97.48%
The high accuracy and F1 scores indicate that the GloVe-based model effectively captures
semantic information, improving classification performance for this task.
Word2Vec Model (AG News -> NYT):
I trained a Word2Vec model on the AG News dataset using gensim.
The trained Word2Vec model was then used to create document vectors for the NYT dataset by
averaging the word vectors of each document. If no word from a document was present in the
Word2Vec vocabulary, a zero vector was assigned.
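A sketch of this step with gensim; the hyperparameters shown (vector_size, window, min_count) are placeholders, since the exact values are not listed here, and get_document_vector_w2v_ag mirrors the function name used in the notebook:

from gensim.models import Word2Vec
import numpy as np

# Train Word2Vec on the tokenized AG News text (hyperparameters are illustrative)
w2v_ag = Word2Vec(sentences=ag['processed_text'], vector_size=100,
                  window=5, min_count=1, workers=4)

def get_document_vector_w2v_ag(tokens):
    # Average the vectors of tokens present in the Word2Vec vocabulary
    vectors = [w2v_ag.wv[word] for word in tokens if word in w2v_ag.wv]
    if not vectors:
        return np.zeros(w2v_ag.vector_size)  # zero vector if nothing matched
    return np.mean(vectors, axis=0)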
Model Evaluation
The model achieved the following performance metrics on the test set:
● Accuracy: 96.26%
● Macro-F1 Score: 90.42%
● Micro-F1 Score: 96.26%
Word2Vec Model (NYT):
In this task, I trained a Word2Vec model on the NYT dataset's preprocessed text and used it to
create document vectors for text classification, following the same averaging approach as above.
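The same recipe as the AG News sketch applies here, only trained on the NYT tokens (hyperparameters again illustrative):

w2v_nyt = Word2Vec(sentences=nyt['processed_text'], vector_size=100,
                   window=5, min_count=1, workers=4)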
Model Evaluation
A Logistic Regression model was trained using the document vectors from the Word2Vec
model. The evaluation metrics were as follows:
● Accuracy: 96.96%
● Macro-F1 Score: 92.63%
● Micro-F1 Score: 96.96%
The results indicate that the Word2Vec model effectively captures the semantic meaning of
words, leading to high accuracy and F1 scores for the classification task.
What are the disadvantages of averaging word vectors for the document representation?
Describe an idea to overcome this. The document vectors should be formed using word vectors.
-> Averaging word vectors ignores the order in which words appear in a document. This can
lead to situations where sentences with completely different meanings (e.g., "The cat chased
the mouse" vs. "The mouse chased the cat") produce the same document vector.
Instead of averaging the word vectors equally, apply weights based on the positions of the
words. For example, you can assign higher weights to words that appear earlier or later in the
document, depending on the nature of the text.
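A sketch of this idea using a simple linear decay so earlier words receive higher weight; word_vectors can be any word-to-vector mapping (e.g., the GloVe dictionary), and this particular weighting scheme is just one possible choice:

import numpy as np

def position_weighted_doc_vector(tokens, word_vectors, dim=100):
    vecs, weights = [], []
    n = len(tokens)
    for i, word in enumerate(tokens):
        if word in word_vectors:
            vecs.append(word_vectors[word])
            weights.append(1.0 - i / max(n, 1))  # weight decays with position
    if not vecs:
        return np.zeros(dim)
    # Weighted average instead of a plain mean, so word order affects the result
    return np.average(np.array(vecs), axis=0, weights=np.array(weights))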
BERT Model
Dataset Preparation:
Config:
MAX_LEN = 64
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 3
LEARNING_RATE = 1e-05
Preprocessing:
● Tokenize the input text into tokens that BERT can process.
● Pad the sequences to ensure uniform length.
● Create attention masks to distinguish between real tokens and padding.
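A small illustration of these steps (a sketch, not the original code); padding='max_length' with truncation=True is the non-deprecated equivalent of the pad_to_max_length flag used in the notebook:

sample = tokenizer.encode_plus(
    "oil prices soar to all-time record",
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_token_type_ids=True,
)
# input_ids are padded to MAX_LEN; the attention mask is 1 for real tokens, 0 for padding
print(len(sample['input_ids']), sample['attention_mask'][:10])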
The pre-trained BERT model was fine-tuned for the three-class classification task (sports,
politics, business). A linear layer was added on top of BERT's output to classify the inputs.
Model Training:
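The cells defining the data loaders, loss, and optimizer are not reproduced above, so the following is a sketch of a setup consistent with the code shown: BCEWithLogitsLoss matches the one-hot label vectors and the sigmoid applied during validation, and Adam with LEARNING_RATE is an assumption.

import torch
from torch.utils.data import DataLoader

# Wrap the train/test splits with the CustomDataset defined earlier
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)
training_loader = DataLoader(training_set, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
testing_loader = DataLoader(testing_set, batch_size=VALID_BATCH_SIZE, shuffle=False)

# Assumed loss and optimizer, consistent with the one-hot targets and validation sigmoid
loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

for epoch in range(EPOCHS):
    train(epoch)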
Model Performance:
The model achieved the following performance metrics on the test set:
● Accuracy: 96.39%
● Macro-F1 Score: 94.45%
● Micro-F1 Score: 97.57%
These metrics indicate that the BERT model performs well on the text classification task.