0% found this document useful (0 votes)
32 views6 pages

Lab 78

fgfgfggf
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
32 views6 pages

Lab 78

fgfgfggf
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 6

Artificial intelligence Course: IT153IU

International University – VNU HCM Date: May 24th 2024


Dr. Nguyen Trung Ky Time: 2 weeks

Full name: Lê Hồng Quang

Student ID: ITITIU20286

Lab#7&8/Assignment#7&8: NaiveBayes
Exercise 1: Create a class NaiveBayesFilter, with an __init__() method
as defined in NaiveBayesFilter. Add a fit() method which takes as
arguments X, the training data, and y the training labels. In this case,
X is a pandas.Series containing strings that are SMS messages. For
each message in X count the number of occurrences of each word
and record this information in a DataFrame. The final form of the
DataFrame should have a column for each unique word that appears
in any message of X as well as a Label column, you may also include
any other columns you think you’ll need. Each row of the DataFrame
corresponds to a row of X, and records the number of occurrences of
each word in a given message. The Label column records the label of
the message. Save this DataFrame as self.data.
class NaiveBayesFilter:
    """Naive Bayes spam filter over tokenized SMS messages."""

    def __init__(self):
        # Training table built by fit(): one row per message,
        # one count column per vocabulary word plus Label and SMS.
        self.data = []
        # List of unique words collected during fit().
        self.vocabulary = []
        # Cached output of predict_proba() and a spare holder.
        self.proba = []
        self.prob = []
        # Class priors, estimated in fit().
        self.p_spam = 0  # P(Spam)
        self.p_ham = 0   # P(Ham)
        # Smoothed P(word | class) tables, filled in by fit().
        # vocabulary is empty at this point, so both start as empty dicts.
        self.parameters_spam = {word: 0 for word in self.vocabulary}
        self.parameters_ham = {word: 0 for word in self.vocabulary}

def fit(self, X, y):
    """Build the word-count DataFrame and estimate Naive Bayes parameters.

    Parameters
    ----------
    X : pandas.Series (named 'SMS') of tokenized messages — each element
        is an iterable of words.
    y : pandas.Series (named 'Label') with values 'spam' or 'ham',
        sharing X's index.

    Returns
    -------
    pandas.DataFrame
        self.data: one row per message with Label, SMS and one
        occurrence-count column per vocabulary word.
    """
    # Collect every word seen in the training messages, then dedupe.
    for sms in X:
        for word in sms:
            self.vocabulary.append(word)
    self.vocabulary = list(set(self.vocabulary))

    # Per-word occurrence counts, one list slot per message (positional).
    word_counts_per_sms = {unique_word: [0] * len(X)
                           for unique_word in self.vocabulary}
    for index, sms in enumerate(X):
        for word in sms:
            word_counts_per_sms[word][index] += 1

    # BUG FIX: build the count frame on X's own index so the concat below
    # aligns correctly even when X/y are slices with a non-zero-based
    # index (e.g. X_train = X[500:]). The original used the default
    # 0..N-1 index, silently misaligning counts with labels.
    word_counts = pd.DataFrame(word_counts_per_sms, index=X.index)
    self.data = pd.concat([y, X, word_counts], axis=1)

    print(self.data)

    spam_messages = self.data[self.data['Label'] == 'spam']
    ham_messages = self.data[self.data['Label'] == 'ham']

    # Class priors P(Spam) and P(Ham).
    self.p_spam = len(spam_messages) / len(self.data)
    self.p_ham = len(ham_messages) / len(self.data)

    # Total number of word tokens in each class (N_Spam, N_Ham).
    n_spam = spam_messages['SMS'].apply(len).sum()
    n_ham = ham_messages['SMS'].apply(len).sum()

    # N_Vocabulary: number of distinct words.
    n_vocabulary = len(self.vocabulary)

    # Laplace (add-one) smoothing constant.
    alpha = 1

    print('Number of words in spam messages is: ' + str(n_spam) + '\n')
    print('Number of words in ham messages is: ' + str(n_ham) + '\n')
    print('Number of unique words are: ' + str(n_vocabulary))

    # Smoothed conditional probabilities P(word | class).
    for word in self.vocabulary:
        n_word_given_spam = spam_messages[word].sum()
        self.parameters_spam[word] = (
            (n_word_given_spam + alpha) / (n_spam + alpha * n_vocabulary))

        n_word_given_ham = ham_messages[word].sum()
        self.parameters_ham[word] = (
            (n_word_given_ham + alpha) / (n_ham + alpha * n_vocabulary))
    return self.data

Exercise 2: Implement the predict_proba() method in your naïve Bayes


classifier. This should take as an argument X, the data that needs to
be classified. For each message x in X compute P(S|x) and P(H|x)
using Equations 1.3, 1.4, and 1.5. The method should return a (Nx2)
list, where N is the length of X. The first column corresponds to P(C =
H|x), and the second to P(C = S|x).
def predict_proba(self, X):
    """Compute per-message class likelihoods and a hard prediction.

    Parameters
    ----------
    X : pandas.Series of tokenized messages (iterables of words).

    Returns
    -------
    pandas.DataFrame
        self.proba: the original messages plus 'ham' and 'spam' columns
        (products of P(word|class) over in-vocabulary words; words
        outside the vocabulary contribute a factor of 1) and a
        'predict' column with the argmax label.
    """
    # One [ham, spam] likelihood pair per message, keyed by position.
    self.proba_dict = {pos: [1, 1] for pos in range(len(X))}
    for pos, sms in enumerate(X):
        for word in sms:
            # Out-of-vocabulary words are simply skipped (factor 1).
            if word in self.vocabulary:
                self.proba_dict[pos][0] *= self.parameters_ham[word]
                self.proba_dict[pos][1] *= self.parameters_spam[word]

    proba_table = pd.DataFrame(self.proba_dict, index=['ham', 'spam']).T
    # BUG FIX: re-index the result frames on X's own index so the concat
    # below stays aligned when X is a slice with a non-zero-based index
    # (e.g. X_test[500:505]); the original used 0..N-1 and misaligned.
    proba_table.index = X.index

    predict_list = []
    for pos in range(len(proba_table)):
        row = proba_table.iloc[pos]
        if row['spam'] > row['ham']:
            predict_list.append('spam')
        else:
            predict_list.append('ham')
    predict_table = pd.DataFrame(predict_list, columns=['predict'],
                                 index=X.index)

    self.proba = pd.concat([X, proba_table, predict_table], axis=1)
    return self.proba

Exercise 3: Implement the predict() method in your naïve Bayes


classifier. This should take as an argument X, the data that needs to
be classified. Implement equation 1.2 and return a list of labels that
predicts each message in X. For example:

#create the filter

NB = NaiveBayesFilter()

#fit the filter with train data

NB.fit(X_train, y_train)

#test the predict function with five data points in test data

NB.predict(X_test[500:505])

Output: ['ham', 'ham', 'ham', 'ham', 'spam']


def predict(self, X):
    """Classify each tokenized message in X as 'spam' or 'ham'.

    Scores are accumulated in log space — log prior plus the sum of
    log P(word|class) over words known to each class's parameter
    table — and the larger score wins (ties go to 'ham').
    """
    labels = []
    for message in X:
        # Start each class score from its (log) prior.
        log_spam = np.log(self.p_spam)
        log_ham = np.log(self.p_ham)

        for token in message:
            if token in self.parameters_spam:
                log_spam += np.log(self.parameters_spam[token])
            if token in self.parameters_ham:
                log_ham += np.log(self.parameters_ham[token])

        labels.append('spam' if log_spam > log_ham else 'ham')

    return labels

Exercise 4: Implement the score() method in your naïve Bayes


classifier. This should take two arguments: a list of predicted labels
(returned from the predict() method) and the list of true labels for X. It
should return the fraction of labels that match between the two lists,
which is referred to as the recall metric.

For example:

#test the predict function with five data points in test data

predict_labels = NB.predict(X_test[500:505])

#calculate the score

recall = NB.score(y_test[500:505], predict_labels)

print("recall of NB: ", recall)

Output: recall of NB: 0.8


def score(self, y, predict_label):
    """Compute precision, recall and F1 with 'ham' as the positive class.

    Also prints the 2x2 confusion matrix and the accuracy.

    Parameters
    ----------
    y : sequence of true labels ('ham'/'spam'); may be a pandas Series.
    predict_label : sequence of predicted labels, same length as y.

    Returns
    -------
    tuple of floats : (precision, recall, F1).
    """
    # BUG FIX: iterate positionally. The original `y[i]` lookup raised
    # KeyError for pandas Series slices with a non-zero-based index
    # (the lab's own example passes y_test[500:505]).
    true_labels = list(y)
    predicted = list(predict_label)

    # Confusion matrix A[predicted][true]: index 0 = ham, 1 = spam.
    A = [[0, 0], [0, 0]]
    for pred, truth in zip(predicted, true_labels):
        row = 0 if pred == 'ham' else 1
        col = 0 if truth == 'ham' else 1
        A[row][col] += 1

    for line in A:
        print(' '.join(map(str, line)))

    tp = A[0][0]  # predicted ham, truly ham
    fp = A[0][1]  # predicted ham, truly spam (false positive)
    fn = A[1][0]  # predicted spam, truly ham (false negative)

    # BUG FIX: the original swapped the two formulas. By definition
    # precision = TP/(TP+FP) and recall = TP/(TP+FN). Also guard the
    # zero-denominator cases instead of raising ZeroDivisionError.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    F1 = (2 * precision * recall / (precision + recall)
          if (precision + recall) else 0.0)

    accuracy = (A[0][0] + A[1][1]) / len(true_labels)
    print('Naive Bayes accuracy: ', accuracy * 100, '%')
    return precision, recall, F1

You might also like