Sentiment Classifier

The document outlines a Python script that utilizes the CountVectorizer from sklearn to process text data from files, categorizing them into positive and negative sentiments. It initializes dictionaries to store the text data, counts occurrences of words, and calculates probabilities for each word based on its sentiment. The script also constructs a corpus from a limited number of positive and negative samples for further analysis.


from sklearn.feature_extraction.text import CountVectorizer


# Read a sample review and fit a CountVectorizer on it.
f = open("cv000_29416.txt", "r")
a = f.read()
f.close()

data = []
data.append(a)

vectorizer = CountVectorizer()
vectorizer.fit(data)
vocab = vectorizer.vocabulary_
print(vectorizer.vocabulary_)

import os

def init_dict(a_dir):
    """Read every file in a_dir into a dict keyed by filename."""
    a_dict = {}
    file_list = os.listdir(a_dir)
    for a_file in file_list:
        f = open(a_dir + a_file, 'r')
        a_dict[a_file] = f.read()
        f.close()
    return a_dict

def print_dict(a_dict):
    """Print each filename and its contents, sorted by filename."""
    for key in sorted(a_dict.keys()):
        print(key, ":\n", a_dict.get(key))

# Transform the fitted document into a sparse document-term matrix.
a = vectorizer.transform(data)

a.shape
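
To make the sparse output concrete, a small sketch of inspecting it (this is not in the original script; get_feature_names_out assumes scikit-learn 1.0 or later):

# Illustrative only: peek at the dense counts for the single document.
counts = a.toarray()[0]
terms = vectorizer.get_feature_names_out()

# Ten most frequent terms in this review.
top = sorted(zip(terms, counts), key=lambda pair: -pair[1])[:10]
print(top)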

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import os

f=open("/content/neg/cv001_19502.txt","r")
a=f.read()
f.close()

data=[]
data.append(a)

vectorizer = CountVectorizer()
vectorizer.fit(data)
nev_set=vectorizer.vocabulary_
print(vectorizer.vocabulary_)

f=open("/content/pos/cv000_29590.txt","r")
a=f.read()
f.close()

data=[]
data.append(a)
vectorizer = CountVectorizer()
vectorizer.fit(data)
nev_set=vectorizer.vocabulary_
print(vectorizer.vocabulary_)


pos = init_dict("pos/")
neg = init_dict("neg/")

len(pos)

len(neg)

print_dict(pos)

print_dict(neg)


# Build a labelled DataFrame: one row per line of each negative review.
# (DataFrame.append was removed in pandas 2.0, so rows are collected in a
# list and turned into a DataFrame in one step.)
directory = os.fsencode("/content/neg")
os.chdir(directory)
rows = []
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".txt"):
        with open(filename, "r") as a_file:
            for line in a_file:
                rows.append({'text': line, 'tag': "neg"})
data = pd.DataFrame(rows, columns=["text", "tag"])


data

data.head(10)

# Append the positive reviews with the "pos" tag.
directory = os.fsencode("/content/pos")
os.chdir(directory)
rows = []
for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.endswith(".txt"):
        with open(filename, "r") as a_file:
            for line in a_file:
                rows.append({'text': line, 'tag': "pos"})
data = pd.concat([data, pd.DataFrame(rows, columns=["text", "tag"])], ignore_index=True)

data

data.head(10)

# word_counts maps each word to its per-class occurrence counts; the row's
# tag column decides which counter to increment, rather than a hard-coded
# row-index threshold. (The name `dict` would shadow the builtin.)
word_counts = {}
for i in range(len(data)):
    row = data.iloc[i, :]
    for word in row.text.split(" "):
        if word not in word_counts:
            word_counts[word] = {"pos": 0, "neg": 0}
        word_counts[word][row.tag] = word_counts[word][row.tag] + 1

word_counts

# Total number of word occurrences across both classes.
count = 0
for word in word_counts:
    count = count + word_counts[word]["pos"]
    count = count + word_counts[word]["neg"]

# Overall probability of each word: its share of all occurrences.
for word in word_counts:
    word_counts[word]["prob"] = (word_counts[word]["pos"] + word_counts[word]["neg"]) / count

# Conditional probabilities: the share of a word's occurrences in each class.
for word in word_counts:
    total = word_counts[word]["pos"] + word_counts[word]["neg"]
    word_counts[word]["con_pos_prob"] = word_counts[word]["pos"] / total
    word_counts[word]["con_neg_prob"] = word_counts[word]["neg"] / total

word_counts
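
The script stops at computing the per-word probabilities. A minimal sketch of how they could drive a prediction, summing log conditional probabilities per class (the classify helper and the eps constant are illustrative assumptions, not part of the original):

import math

def classify(text, word_counts):
    eps = 1e-6  # floor for zero probabilities so log() stays defined
    pos_score = 0.0
    neg_score = 0.0
    for word in text.split(" "):
        if word in word_counts:  # unseen words are simply skipped
            pos_score += math.log(word_counts[word]["con_pos_prob"] + eps)
            neg_score += math.log(word_counts[word]["con_neg_prob"] + eps)
    return "pos" if pos_score > neg_score else "neg"

print(classify("a genuinely moving and well acted film", word_counts))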

from sklearn.feature_extraction.text import CountVectorizer

# Collect the first 700 positive and first 700 negative reviews.
count = 0
corpus = []
for i in pos:
    if count < 700:
        corpus.append(pos[i])
        count = count + 1
count = 0
for i in neg:
    if count < 700:
        corpus.append(neg[i])
        count = count + 1

len(corpus)

# Vectorize the whole corpus. get_feature_names_out replaces
# get_feature_names, which was removed in scikit-learn 1.2.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
x = vectorizer.get_feature_names_out()

X_1 = X.toarray()
print(X_1)
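
With X in hand, a classifier can be fit directly. A minimal sketch (not in the original; the label vector assumes both directories supplied at least 700 files, so the corpus is 700 positives followed by 700 negatives, and the split fractions are arbitrary):

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Labels mirror the corpus construction order: positives first, then negatives.
y = ["pos"] * 700 + ["neg"] * 700

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

clf = MultinomialNB()
clf.fit(X_train, y_train)
print("held-out accuracy:", clf.score(X_test, y_test))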
