# Sentiment Classifier
# NOTE(review): out-of-order notebook cell — `a` and `CountVectorizer` are not
# defined yet at this point in the file. The working version of this cell
# appears again below (L29-34 region), after the imports and the file read.
data = []
data.append(a)
vectorizer = CountVectorizer()
vectorizer.fit(data)
# Vocabulary of the fitted vectorizer: {token: column index}.
nev_set = vectorizer.vocabulary_
print(vectorizer.vocabulary_)
import os
def init_dict(a_dir):
    """Read every file in *a_dir* and return {filename: file contents}.

    ``os.path.join`` replaces the original ``a_dir + a_file`` concatenation,
    so the directory path no longer needs a trailing separator (calls like
    ``init_dict("pos/")`` keep working unchanged). Files are opened with a
    ``with`` block so handles are closed even if reading raises.
    """
    a_dict = {}
    for a_file in os.listdir(a_dir):
        with open(os.path.join(a_dir, a_file), 'r') as f:
            a_dict[a_file] = f.read()
    return a_dict
def print_dict(a_dict):
    """Print every entry of *a_dict* in key-sorted order, one per pair of lines."""
    for key, value in sorted(a_dict.items()):
        print(key, ":\n", value)
# NOTE(review): out-of-order notebook cell — encodes `data` with whichever
# vectorizer was fitted most recently. `a.shape` is a bare expression: it
# displays in Jupyter but is a no-op in a plain script.
a = vectorizer.transform(data)
a.shape
import os

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words vocabulary from a single negative review.
# `with` guarantees the file handle is closed even if read() raises.
with open("/content/neg/cv001_19502.txt", "r") as f:
    a = f.read()

data = [a]
vectorizer = CountVectorizer()
vectorizer.fit(data)
# {token: column index} mapping learned from this one document.
nev_set = vectorizer.vocabulary_
print(vectorizer.vocabulary_)
# Build a bag-of-words vocabulary from a single positive review.
# `with` guarantees the file handle is closed even if read() raises.
with open("/content/pos/cv000_29590.txt", "r") as f:
    a = f.read()

data = [a]
vectorizer = CountVectorizer()
vectorizer.fit(data)
# {token: column index} mapping learned from this one document.
nev_set = vectorizer.vocabulary_
print(vectorizer.vocabulary_)
def init_dict(a_dir):
    """Read every file in *a_dir* and return {filename: file contents}.

    ``os.path.join`` replaces the original ``a_dir + a_file`` concatenation,
    so the directory path no longer needs a trailing separator (calls like
    ``init_dict("pos/")`` keep working unchanged). Files are opened with a
    ``with`` block so handles are closed even if reading raises.
    """
    a_dict = {}
    for a_file in os.listdir(a_dir):
        with open(os.path.join(a_dir, a_file), 'r') as f:
            a_dict[a_file] = f.read()
    return a_dict
def print_dict(a_dict):
    """Print every entry of *a_dict* in key-sorted order, one per pair of lines."""
    for key, value in sorted(a_dict.items()):
        print(key, ":\n", value)
# Load every review file into in-memory dicts keyed by filename.
# NOTE(review): relative paths — assumes the CWD contains pos/ and neg/
# (earlier cells chdir into /content/neg, so this likely ran before them).
pos = init_dict("pos/")
neg = init_dict("neg/")
# Bare expressions below are notebook-cell artifacts: in a plain script they
# evaluate and discard their results (only the print_dict calls produce output).
len(pos)
len(neg)
print_dict(pos)
print_dict(neg)
# First attempt at loading the negative reviews into a DataFrame.
# NOTE(review): out-of-order notebook cells — the `data` DataFrame is only
# created on the LAST line here, after the loop that appends to it; run as a
# straight script this loop fails before `data` is a DataFrame. The working
# version of this cell follows immediately below; this one's net effect is
# just the empty-DataFrame initialization at the end.
import pandas as pd
import os

directory = os.fsencode("/content/neg")
os.chdir(directory)
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    # NOTE(review): both sides of this `or` are identical — duplicated test.
    if filename.endswith(".txt") or filename.endswith(".txt"):
        with open(filename, "r") as a_file:
            for line in a_file:
                # Each line of each review becomes one row tagged "neg".
                datarow = {'text':line ,'tag': "neg"}
                # NOTE(review): DataFrame.append without ignore_index raises
                # TypeError for dict rows, and append itself was removed in
                # pandas 2.0 — see the corrected cell below.
                data = data.append(datarow)
        continue
    else:
        continue
data = pd.DataFrame(columns=["text","tag"])
import os

# Append every line of every negative review in /content/neg to `data`,
# tagged "neg". Rows are collected in a plain list and concatenated once:
# DataFrame.append was deprecated and removed in pandas 2.0, and per-row
# appends copy the frame each time (quadratic). The duplicated
# `endswith(".txt") or endswith(".txt")` test is collapsed to one check.
directory = os.fsencode("/content/neg")
os.chdir(directory)
rows = []
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".txt"):
        with open(filename, "r") as a_file:
            for line in a_file:
                rows.append({'text': line, 'tag': "neg"})
data = pd.concat([data, pd.DataFrame(rows)], ignore_index=True)

# Notebook-cell artifacts: bare expressions display in Jupyter, no-ops here.
data
data.head(10)
import os

# Append every line of every positive review to `data`, tagged "pos".
# NOTE(review): the original scanned /content/neg here while tagging rows
# "pos" — almost certainly a copy-paste slip from the previous cell;
# corrected to /content/pos. Same pandas-2.0-safe pd.concat pattern and
# de-duplicated `.endswith(".txt")` test as the negative-review loader.
directory = os.fsencode("/content/pos")
os.chdir(directory)
rows = []
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".txt"):
        with open(filename, "r") as a_file:
            for line in a_file:
                rows.append({'text': line, 'tag': "pos"})
data = pd.concat([data, pd.DataFrame(rows)], ignore_index=True)

# Notebook-cell artifacts: bare expressions display in Jupyter, no-ops here.
data
data.head(10)
# Per-word occurrence counts split into "pos"/"neg" buckets:
# {word: {"pos": count, "neg": count}}.
# NOTE(review): the name shadows the builtin `dict`; kept as-is because the
# later cells reference it by this name.
dict = {}
import re  # NOTE(review): imported but unused in this cell.

# NOTE(review): 31783 is a magic row index — presumably the last "pos" row in
# `data` given the load order above; confirm against the loader cells.
POS_LAST_ROW = 31783
for i in range(len(data)):
    row = data.iloc[i, :]
    # The original had two near-duplicate branches whose initialize-then-set
    # dance was equivalent to "init both counts to 0, then increment"; this
    # unified form produces identical counts.
    tag = "pos" if i <= POS_LAST_ROW else "neg"
    for word in row.text.split(" "):
        if word not in dict:
            dict[word] = {"pos": 0, "neg": 0}
        dict[word][tag] += 1

# Notebook-cell artifact: bare expression, no-op in a plain script.
dict
# Sum every per-word pos+neg count to get the total number of token
# occurrences seen. NOTE(review): the original's `p = ()` was never used and
# is dropped, along with stray trailing semicolons.
count = 0
for word_counts in dict.values():
    count = count + word_counts["pos"] + word_counts["neg"]

# Notebook-cell artifact: bare expression, no-op in a plain script.
dict
# Build a balanced corpus from the first 700 reviews of each class
# (dict insertion order == directory listing order from init_dict).
# NOTE(review): `j` mirrors `count` and is never read afterwards; kept only
# in case unseen later cells reference it.
j = 0
count = 0
corpus = []
for key in pos:
    if count < 700:
        corpus.append(pos[key])
        j = j + 1
        count = count + 1

j = 0
count = 0
for key in neg:
    if count < 700:
        corpus.append(neg[key])
        j = j + 1
        count = count + 1

# Notebook-cell artifact: bare expression, no-op in a plain script.
len(corpus)

# Bag-of-words encode the corpus into a document-term matrix X.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# NOTE(review): CountVectorizer.get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is the supported replacement
# (returns an ndarray of feature names instead of a list).
x = vectorizer.get_feature_names_out()
X_1 = X.toarray()
print(X_1)