Bayesian Methods
Bayesian Methods
Bayesian Methods
def readFiles(path):
    """Yield ``(file_path, body)`` for every file found under *path*.

    The tree rooted at *path* is walked recursively with ``os.walk``. Each
    file is read as Latin-1 text. Every line up to and including the first
    blank line is skipped (presumably the e-mail header block -- confirm
    against the data set), and the remaining lines make up the body. A file
    that contains no blank line yields an empty body.

    Args:
        path: Root directory to scan.

    Yields:
        ``(file_path, message)`` tuples, where ``file_path`` is the root
        directory joined with the file name and ``message`` is the body text.
    """
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Use a separate local so the ``path`` argument is not rebound.
            file_path = os.path.join(root, filename)
            in_body = False
            body_lines = []
            # ``with`` guarantees the handle is closed even if reading raises.
            with io.open(file_path, 'r', encoding='latin1') as f:
                for line in f:
                    if in_body:
                        body_lines.append(line)
                    elif line == '\n':
                        # First blank line: everything after it is body.
                        in_body = True
            # NOTE: each collected line still ends with its own newline, so
            # joining with '\n' doubles the line breaks. This is kept on
            # purpose so the produced text stays identical for callers.
            yield file_path, '\n'.join(body_lines)
1
def dataFrameFromDirectory(path, classification):
    """Load every message under *path* into a DataFrame with one label.

    Args:
        path: Root directory passed on to ``readFiles``.
        classification: Label stored in the ``'class'`` column of every row.

    Returns:
        A pandas ``DataFrame`` with one row per file, indexed by the file
        path, with the columns ``'message'`` (body text) and ``'class'``.
    """
    # Imported locally so the function works regardless of how the
    # surrounding file imports pandas.
    from pandas import DataFrame

    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)
    # Without an explicit return the collected rows would be discarded and
    # callers would receive None.
    return DataFrame(rows, index=index)
[14]: data.head()
[14]: message \
emails/spam/00217.43b4ef3d9c56cf42be9c37b546a19e78 <html><xbody>\n\n<hr width =
"100%">\n\n<cente…
emails/spam/00328.73c1a9f83d3b1247522c26eb6d74c215 \n\n Socijalisticka
partija Srbije, pred…
emails/spam/00408.22230b84aee00e439ae1938e025d5005 \n\n<html>\n\n<body
bgcolor="#FFFFFF">\n\n<TAB…
emails/spam/00383.1aa9a8211d1de540d6e3852e230e5a9d
<html>\n\n<head>\n\n<title>FREE* Liz Claiborne…
emails/spam/00390.ce19abc8034db9e6b435d494a91db87a This message is in MIME
format. Since your mai…
class
emails/spam/00217.43b4ef3d9c56cf42be9c37b546a19e78 spam
emails/spam/00328.73c1a9f83d3b1247522c26eb6d74c215 spam
emails/spam/00408.22230b84aee00e439ae1938e025d5005 spam
emails/spam/00383.1aa9a8211d1de540d6e3852e230e5a9d spam
emails/spam/00390.ce19abc8034db9e6b435d494a91db87a spam
# Create the MultinomialNB classifier that will be trained on the messages.
classifier = MultinomialNB()
# Training labels: the raw values of the 'class' column of `data`.
targets = data['class'].values
# `counts` is defined outside this excerpt -- presumably the vectorized
# message features; confirm against the preceding cell.
classifier.fit(counts, targets)
[8]: MultinomialNB()
[11]: examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
# Convert the example sentences with `vectorizer` (defined outside this
# excerpt -- presumably the same one that produced `counts`; confirm).
example_counts = vectorizer.transform(examples)
# Ask the fitted classifier for one predicted label per example.
predictions = classifier.predict(example_counts)
2
# Bare expression: as the last statement of a notebook cell this echoes the
# predicted labels.
predictions
Como se pode ver acima, o modelo classifica corretamente as duas frases de exemplo.