Artificial Intelligence Course: IT153IU
International University – VNU HCM Date: May 24th, 2024
Dr. Nguyen Trung Ky Time: 2 weeks
Full name: Lê Hồng Quang
Student ID: ITITIU20286
Lab#7&8/Assignment#7&8: NaiveBayes
Exercise 1: Create a class NaiveBayesFilter, with an __init__() method
as defined in NaiveBayesFilter. Add a fit() method which takes as
arguments X, the training data, and y, the training labels. In this case,
X is a pandas.Series containing strings that are SMS messages. For
each message in X, count the number of occurrences of each word
and record this information in a DataFrame. The final form of the
DataFrame should have a column for each unique word that appears
in any message of X, as well as a Label column; you may also include
any other columns you think you'll need. Each row of the DataFrame
corresponds to a row of X and records the number of occurrences of
each word in the given message. The Label column records the label of
the message. Save this DataFrame as self.data.
import numpy as np
import pandas as pd

class NaiveBayesFilter:
    def __init__(self):
        self.data = []        # word-count DataFrame built by fit()
        self.vocabulary = []  # list of unique words seen in training
        self.proba = []       # per-message probability table built by predict_proba()
        self.p_spam = 0       # prior probability of spam, P(S)
        self.p_ham = 0        # prior probability of ham, P(H)
        # Conditional word probabilities P(word | class), filled in by fit()
        self.parameters_spam = {}
        self.parameters_ham = {}
    def fit(self, X, y):
        # X is a pandas.Series of message strings: split each message into
        # a list of words, and reset both indices so the concatenation
        # below lines up row by row.
        X = X.str.split().reset_index(drop=True).rename('SMS')
        y = y.reset_index(drop=True).rename('Label')
        # Build the vocabulary: every unique word in any training message.
        for sms in X:
            for word in sms:
                self.vocabulary.append(word)
        self.vocabulary = list(set(self.vocabulary))
        # Count occurrences of each vocabulary word in each message.
        word_counts_per_sms = {unique_word: [0] * len(X)
                               for unique_word in self.vocabulary}
        for index, sms in enumerate(X):
            for word in sms:
                word_counts_per_sms[word][index] += 1
        word_counts = pd.DataFrame(word_counts_per_sms)
        # One row per message: Label, the tokenized SMS, then the counts.
        self.data = pd.concat([y, X, word_counts], axis=1)
        print(self.data)
        spam_messages = self.data[self.data['Label'] == 'spam']
        ham_messages = self.data[self.data['Label'] == 'ham']
        # P(Spam) and P(Ham)
        self.p_spam = len(spam_messages) / len(self.data)
        self.p_ham = len(ham_messages) / len(self.data)
        # N_Spam: total number of words in spam messages
        n_words_per_spam_message = spam_messages['SMS'].apply(len)
        n_spam = n_words_per_spam_message.sum()
        # N_Ham: total number of words in ham messages
        n_words_per_ham_message = ham_messages['SMS'].apply(len)
        n_ham = n_words_per_ham_message.sum()
        # N_Vocabulary
        n_vocabulary = len(self.vocabulary)
        # Laplace smoothing constant
        alpha = 1
        print('Number of words in spam messages is: ' + str(n_spam) + '\n')
        print('Number of words in ham messages is: ' + str(n_ham) + '\n')
        print('Number of unique words is: ' + str(n_vocabulary))
        # Estimate P(word | Spam) and P(word | Ham) with Laplace smoothing.
        for word in self.vocabulary:
            n_word_given_spam = spam_messages[word].sum()
            p_word_given_spam = ((n_word_given_spam + alpha)
                                 / (n_spam + alpha * n_vocabulary))
            self.parameters_spam[word] = p_word_given_spam
            n_word_given_ham = ham_messages[word].sum()
            p_word_given_ham = ((n_word_given_ham + alpha)
                                / (n_ham + alpha * n_vocabulary))
            self.parameters_ham[word] = p_word_given_ham
        return self.data
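A minimal smoke test of fit() on a made-up two-message corpus (the messages and labels here are hypothetical, purely for illustration; the lab itself uses the SMS spam dataset):
# Toy corpus, just to inspect the DataFrame built by fit():
X_toy = pd.Series(['win cash now', 'see you at lunch'], name='SMS')
y_toy = pd.Series(['spam', 'ham'], name='Label')
NB = NaiveBayesFilter()
data = NB.fit(X_toy, y_toy)
# data has one row per message: the Label, the tokenized SMS, and one
# count column for each vocabulary word ('win', 'cash', 'now', 'see',
# 'you', 'at', 'lunch').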
Exercise 2: Implement the predict_proba() method in your naïve Bayes
classifier. This should take as an argument X, the data that needs to
be classified. For each message x in X, compute P(S|x) and P(H|x)
using Equations 1.3, 1.4, and 1.5. The method should return an (N x 2)
list, where N is the length of X. The first column corresponds to P(C =
H|x) and the second to P(C = S|x).
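Equations 1.3–1.5 are not reproduced in this report; assuming they are the standard naïve Bayes factorization that the code below implements, the quantities are

    P(S | x) ∝ P(S) · ∏_{w in x} P(w | S),    P(H | x) ∝ P(H) · ∏_{w in x} P(w | H),

with the Laplace-smoothed word likelihoods estimated in fit():

    P(w | S) = (N_{w,S} + α) / (N_S + α · N_V),

where N_{w,S} is the number of times word w appears in spam messages, N_S is the total word count over spam messages, N_V is the vocabulary size, and α = 1. The shared normalizer P(x) cancels when comparing the two classes, so the implementation can work with the unnormalized products.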
    def predict_proba(self, X):
        # Tokenize the incoming messages and re-index them, as in fit().
        X = X.str.split().reset_index(drop=True).rename('SMS')
        # Start each message at the class priors: [P(H), P(S)].
        self.proba_dict = {i: [self.p_ham, self.p_spam] for i in range(len(X))}
        for index, sms in enumerate(X):
            for word in sms:
                if word in self.vocabulary:
                    self.proba_dict[index][0] *= self.parameters_ham[word]
                    self.proba_dict[index][1] *= self.parameters_spam[word]
                # Words not seen in training are skipped, which is
                # equivalent to multiplying by 1.
        # print(self.proba_dict)
        proba_table = pd.DataFrame(self.proba_dict, index=['ham', 'spam']).T
        predict_list = []
        for i in range(len(proba_table)):
            if proba_table.loc[i, 'spam'] > proba_table.loc[i, 'ham']:
                predict_list.append('spam')
            else:
                predict_list.append('ham')
        predict_table = pd.DataFrame(predict_list, columns=['predict'])
        # print(predict_list)
        self.proba = pd.concat([X, proba_table, predict_table], axis=1)
        # Return the (N x 2) list required by the exercise: column 0 is
        # P(C = H | x), column 1 is P(C = S | x), up to the shared P(x).
        return proba_table[['ham', 'spam']].values.tolist()
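Because predict_proba() multiplies many probabilities smaller than 1, long messages can underflow both columns to 0.0; predict() in Exercise 3 avoids this by comparing log-probabilities instead. A quick check, continuing the toy example from Exercise 1 (the query message is again made up):
# Returns [[P(C = H | x), P(C = S | x)]] for the single toy message,
# up to the shared normalizer P(x).
print(NB.predict_proba(pd.Series(['win lunch now'])))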
Exercise 3: Implement the predict() method in your naïve Bayes
classifier. This should take as an argument X, the data that needs to
be classified. Implement Equation 1.2 and return a list of labels
predicting each message in X. For example:
# create the filter
NB = NaiveBayesFilter()
# fit the filter with the training data
NB.fit(X_train, y_train)
# test the predict function on five data points from the test data
NB.predict(X_test[500:505])
Output: ['ham', 'ham', 'ham', 'ham', 'spam']
    def predict(self, X):
        # Tokenize the incoming messages, as in fit().
        X = X.str.split()
        predicted_labels = []
        for sms in X:
            # Work in log space to avoid floating-point underflow.
            p_spam_given_sms = np.log(self.p_spam)  # log prior of spam
            p_ham_given_sms = np.log(self.p_ham)    # log prior of ham
            for word in sms:
                if word in self.parameters_spam:
                    p_spam_given_sms += np.log(self.parameters_spam[word])
                if word in self.parameters_ham:
                    p_ham_given_sms += np.log(self.parameters_ham[word])
            if p_spam_given_sms > p_ham_given_sms:
                predicted_labels.append('spam')
            else:
                predicted_labels.append('ham')
        return predicted_labels
Exercise 4: Implement the score() method in your naïve Bayes
classifier. This should take two arguments, a list of true labels in X
and a list of predicted labels (as returned by the predict() method),
and return the recall metric computed from the matches between them.
For example:
# test the predict function on five data points from the test data
predict_labels = NB.predict(X_test[500:505])
# calculate the score
recall = NB.score(y_test[500:505], predict_labels)
print("recall of NB: ", recall)
Output: recall of NB: 0.8
    def score(self, y, predict_label):
        # Confusion matrix A: rows are predicted labels, columns are true
        # labels, with index 0 = ham and index 1 = spam.
        y = list(y)  # y may be a pandas Series with a non-zero-based index
        A = [[0, 0],
             [0, 0]]
        for i in range(len(y)):
            if predict_label[i] == 'ham' and y[i] == 'ham':
                A[0][0] += 1   # true ham predicted as ham
            elif predict_label[i] == 'spam' and y[i] == 'spam':
                A[1][1] += 1   # true spam predicted as spam
            elif predict_label[i] == 'spam' and y[i] == 'ham':
                A[1][0] += 1   # true ham predicted as spam
            else:
                A[0][1] += 1   # true spam predicted as ham
        for line in A:
            print(' '.join(map(str, line)))
        # Metrics with ham as the positive class:
        precision = A[0][0] / (A[0][0] + A[0][1])  # correct among predicted ham
        recall = A[0][0] / (A[0][0] + A[1][0])     # recovered among true ham
        F1 = (2 * precision * recall) / (precision + recall)
        accuracy = (A[0][0] + A[1][1]) / len(y)
        print('Naive Bayes accuracy: ', accuracy * 100, '%')
        print('Precision: ', precision, ' F1: ', F1)
        # The exercise asks score() to return the recall metric.
        return recall
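One configuration consistent with the example outputs above: if all five true labels are 'ham' and the predictions are ['ham', 'ham', 'ham', 'ham', 'spam'], the confusion matrix is
A = [[4, 0],
     [1, 0]]
and recall = 4 / (4 + 1) = 0.8, matching the printed recall of NB: 0.8 (precision would then be 4 / 4 = 1.0 and F1 ≈ 0.89).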