NLP Tushar

This document covers preparing bag-of-words and TF-IDF models in Python. It shows code to: 1) clean text data by removing punctuation and lowercasing words; 2) build a vocabulary of the unique words across multiple documents; 3) compute bag-of-words representations by counting word frequencies in each document; 4) vectorize text into feature matrices with scikit-learn's CountVectorizer; 5) compute TF-IDF weights with scikit-learn's TfidfVectorizer.


11. Write a Python program to prepare a Bag of Words model.

Code :-

import pandas as pd
import numpy as np
import re

doc1 = "Game of Thrones is an amazing tv series"
doc2 = "Game of Thrones is best tv series"
doc3 = "Game of Thrones is so great"

# Remove punctuation, lowercase, and split into tokens
l_doc1 = re.sub(r"[^a-zA-Z0-9]", " ", doc1.lower()).split()
l_doc2 = re.sub(r"[^a-zA-Z0-9]", " ", doc2.lower()).split()
l_doc3 = re.sub(r"[^a-zA-Z0-9]", " ", doc3.lower()).split()

# Build the vocabulary (word set): the unique words found across the three documents
wordset12 = np.union1d(l_doc1, l_doc2)
wordset = np.union1d(wordset12, l_doc3)
print(wordset)

def calculateBOW(wordset, l_doc):
    # Start every vocabulary word at zero, then count its occurrences in the document
    tf_diz = dict.fromkeys(wordset, 0)
    for word in l_doc:
        tf_diz[word] = l_doc.count(word)
    return tf_diz

# We can finally obtain the bag-of-words representations of the documents. In the
# end we get a dataframe where each row holds the extracted features of one document.
bow1 = calculateBOW(wordset, l_doc1)
bow2 = calculateBOW(wordset, l_doc2)
bow3 = calculateBOW(wordset, l_doc3)

df_bow = pd.DataFrame([bow1, bow2, bow3])
df_bow.head()
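For these three sentences the result is deterministic, so it can be checked by hand; df_bow should contain:

#    amazing  an  best  game  great  is  of  series  so  thrones  tv
# 0        1   1     0     1      0   1   1       1   0        1   1
# 1        0   0     1     1      0   1   1       1   0        1   1
# 2        0   0     0     1      1   1   1       0   1        1   0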

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
x = vectorizer.fit_transform([doc1, doc2, doc3])
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
print(vectorizer.get_feature_names_out())

df_bow_sklearn = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
df_bow_sklearn.head()

# Repeat with English stop words removed
vectorizer = CountVectorizer(stop_words="english")
x = vectorizer.fit_transform([doc1, doc2, doc3])
print(vectorizer.get_feature_names_out())

df_bow_sklearn = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
df_bow_sklearn.head()

# Bigram features: ngram_range=(2, 2) with stop words removed
vectorizer = CountVectorizer(stop_words="english", ngram_range=(2, 2))
x = vectorizer.fit_transform([doc1, doc2, doc3])
print(vectorizer.get_feature_names_out())

df_bow_sklearn = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
df_bow_sklearn.head()
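Note that stop words are removed before the n-grams are built. For example, doc3 ("Game of Thrones is so great") reduces to ['game', 'thrones', 'great'] once "is" and "so" are dropped, so it contributes the bigrams 'game thrones' and 'thrones great' rather than any bigram containing a stop word.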

# Load an SMS dataset for a small spam-classification example
import pandas as pd
dataset = pd.read_csv(r"c:\Users\HP\data.csv", encoding="ISO-8859-1")
dataset.head()

import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize as wt  # word_tokenize, used as wt() below

stemmer = PorterStemmer()

data = []
for i in range(dataset.shape[0]):
    sms = dataset.iloc[i, 1]

    # Remove non-alphabetic characters
    sms = re.sub('[^A-Za-z]', ' ', sms)

    # Lowercase, so that "Go" and "go" are not counted as two different words
    sms = sms.lower()

    # Tokenising
    tokenized_sms = wt(sms)

    # Remove stop words and apply stemming
    sms_processed = []
    for word in tokenized_sms:
        if word not in set(stopwords.words('english')):
            sms_processed.append(stemmer.stem(word))

    sms_text = " ".join(sms_processed)
    data.append(sms_text)

# Creating the feature matrix
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(max_features=1000)
x = matrix.fit_transform(data).toarray()
y = dataset.iloc[:, 0]

# Split train and test data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y)

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)

# Predict class
y_pred = classifier.predict(x_test)

# Confusion matrix and evaluation metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)
accuracy
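GaussianNB assumes continuous, normally distributed features; for word-count features MultinomialNB is usually the better fit. A minimal swap, keeping the rest of the pipeline unchanged:

from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()  # multinomial model suits discrete count features
classifier.fit(x_train, y_train)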

12. Write a Python program to prepare a TF-IDF model.
Code :-

from sklearn.feature_extraction.text import TfidfVectorizer

d0 = "The car is driven on the road"
d1 = "The truck is driven on the highway"
d2 = "The bike is run on road"

string = [d0, d1, d2]

tfidf = TfidfVectorizer()
result = tfidf.fit_transform(string)
result

print("\nword indices : ")


print(tfidf.vocabulary_)

print("\nidf values : ")
for ele1,ele2 in zip(tfidf.get_feature_names(),tfidf.idf_) :
print(ele1,":",ele2)
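With TfidfVectorizer's defaults (smooth_idf=True), idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing t. A quick sanity check for one term:

import math
n, df_car = 3, 1  # three documents; "car" occurs only in d0
print(math.log((1 + n) / (1 + df_car)) + 1)  # ~1.6931, should match tfidf.idf_ for "car"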

print("\ntf.idf values : ")


print(result)

print("\ntf.idf values i matrix form : ")
print(result.toarray())

13. Write a Python program to prepare a CountVectorizer model.
Code :-
from sklearn.feature_extraction.text import CountVectorizer

# To create a count vectorizer, we simply need to instantiate one.
# There are special parameters we can set here when making the vectorizer,
# but for the most basic example they are not needed.
vectorizer = CountVectorizer()

# For our text, we take a passage from a previous blog post about count vectorization
sample_text = ["One of the most basic ways we can numerically represent words "
               "is through the one-hot encoding method (also sometimes called "
               "count vectorizing)."]

# To actually create the vectorizer, we simply need to call fit on the text
# data that we wish to fit
vectorizer.fit(sample_text)

# Now we can inspect how our vectorizer vectorized the text.
# This will print out the list of words used and their index in the vectors.
print("Vocabulary : ")
print(vectorizer.vocabulary_)

# If we would like to actually create a vector, we can do so by passing the
# text into the vectorizer to get back counts
vector = vectorizer.transform(sample_text)

# Our final vector:
print("Full vector : ")
print(vector.toarray())

# Or, if we wanted to get the vector for one word:
print("Hot vector : ")
print(vectorizer.transform(['hot']).toarray())

# Or, if we wanted to get multiple vectors at once to build matrices:
print("Hot and One : ")
print(vectorizer.transform(['hot', 'one']).toarray())

# We could also do the whole thing at once with the fit_transform method:
print('One swoop : ')
new_text = ["Today is the day that I do the thing today, today"]
new_vectorizer = CountVectorizer()
print(new_vectorizer.fit_transform(new_text).toarray())
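The default token pattern keeps only tokens of two or more word characters, so "I" is dropped; the learned vocabulary is ['day', 'do', 'is', 'that', 'the', 'thing', 'today'] and the printed counts should be:

# [[1 1 1 1 2 1 3]]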

14. Write a Python program to perform text classification with NLTK using a Naive Bayes classifier.
Code :-
import numpy as np
import pandas as pd

df = pd.read_csv("C:\\Users\\Admin\\Downloads\\BBC_News_Train.csv")
df.head()

df.shape

df['Category'].value_counts()

import nltk
from nltk.corpus import stopwords
import string

def text_cleaning(a):
    # Drop punctuation characters, then remove English stop words
    remove_punctuation = [char for char in a if char not in string.punctuation]
    remove_punctuation = ''.join(remove_punctuation)
    return [word for word in remove_punctuation.split()
            if word.lower() not in stopwords.words('english')]

print(df.iloc[:, 1].apply(text_cleaning))
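A quick check on a made-up sentence shows what the cleaner keeps:

print(text_cleaning("This is a sample sentence, with punctuation!"))
# ['sample', 'sentence', 'punctuation']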

from sklearn.feature_extraction.text import CountVectorizer

bow_transformer = CountVectorizer(analyzer=text_cleaning).fit(df['Text'])
bow_transformer.vocabulary_

title_bow = bow_transformer.transform(df['Text'])
print(title_bow)

x = title_bow.toarray()
print(x)
x.shape

from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(title_bow)
print(tfidf_transformer)

title_tfidf = tfidf_transformer.transform(title_bow)
print(title_tfidf)
print(title_tfidf.shape)

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB().fit(title_tfidf,df['Category'])

all_predictions = model.predict(title_tfidf)
print(all_predictions)

from sklearn.metrics import confusion_matrix
confusion_matrix(df['Category'], all_predictions)

from sklearn.metrics import classification_report
print(classification_report(df['Category'], all_predictions))
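Note that these predictions are made on the same data the model was trained on, so the reported scores are optimistic. A minimal sketch of a fairer evaluation on a held-out split, reusing the variables defined above:

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the documents for testing
x_tr, x_te, y_tr, y_te = train_test_split(title_tfidf, df['Category'],
                                          test_size=0.2, random_state=42)
held_out_model = MultinomialNB().fit(x_tr, y_tr)
print(accuracy_score(y_te, held_out_model.predict(x_te)))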

15. Write a Python program to convert words to features with NLTK.
Code :-
import nltk
nltk.download('movie_reviews')
import random
from nltk.corpus import movie_reviews

# Pair each review's word list with its label ("pos" or "neg")
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

# Take the first 3000 distinct words as the feature vocabulary
word_features = list(all_words.keys())[:3000]

def find_features(document):
    # Mark, for every vocabulary word, whether it appears in this document
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

featuresets = [(find_features(rev), category) for (rev, category) in documents]
featuresets
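With the feature sets built, a natural next step is to train and evaluate NLTK's own Naive Bayes classifier; a minimal sketch:

# movie_reviews has 2000 documents; 1900/100 is a common train/test split
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Accuracy:", nltk.classify.accuracy(classifier, testing_set))
classifier.show_most_informative_features(15)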
