t-SNE Visualization of Amazon Reviews with Polarity-Based Color Coding
import sqlite3
import pandas as pd
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re
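The cell that opens the SQLite database did not survive this export; below is a minimal sketch, assuming the standard Amazon Fine Food Reviews dump database.sqlite (the file name, the sorting key, and the Score != 3 filter are assumptions inferred from the rest of the notebook). Its last expression produces the [('Reviews',)] output shown next.
In [0]: # Assumption: the Amazon Fine Food Reviews SQLite dump, named database.sqlite
        con = sqlite3.connect('database.sqlite')
        # Load only reviews with a definite polarity (Score != 3)
        filtered_data = pd.read_sql_query("SELECT * FROM Reviews WHERE Score != 3", con)
        # Sort so drop_duplicates below keeps a deterministic first entry (assumption)
        sorted_data = filtered_data.sort_values('ProductId', axis=0, ascending=True)
        # List the tables in the database
        cur = con.cursor()
        cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
        cur.fetchall()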
[('Reviews',)]
1 Data Cleaning
In [0]: n=25000
In [0]: # Give reviews with Score > 3 a positive rating and reviews with Score < 3 a negative rating
        def partition(x):
            if x < 3:
                return 'negative'
            return 'positive'
#Deduplication of entries
final=sorted_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"}, keep='first')
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]
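The cells that apply partition and build negative_1500 / positive_1500 are missing from this export; a minimal sketch, consistent with the class counts of 1500 each reported below (the head-based sampling is an assumption):
In [0]: # Assumption: map numeric scores to polarity labels, then take 1500 reviews per class
        final['Score'] = final['Score'].map(partition)
        negative_1500 = final[final['Score'] == 'negative'].head(1500)
        positive_1500 = final[final['Score'] == 'positive'].head(1500)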
In [11]: final=pd.concat((negative_1500,positive_1500))
2 Text preprocessing
1. Removal of HTML tags
2. Removal of punctuation marks and special characters
3. Conversion to lowercase
4. Removal of words shorter than 3 letters
5. Removal of numeric characters
6. Stopword removal
7. Snowball stemming
In [0]: nltk.download('stopwords')
Out[0]: True
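The cleanhtml and cleanpunc helpers and the two stopword sets used by the loop below are never defined in this export; here is a minimal sketch. The regexes and the names stop_not_included ('not' kept in the stopword list, hence removed from the text) and stop_not_excluded ('not' dropped from the stopword list, hence retained in the text) are assumptions inferred from how they are used.
In [0]: # Reconstructed helpers (assumptions; not part of the original export)
        def cleanhtml(sentence):
            # strip HTML tags
            return re.sub(r'<.*?>', ' ', sentence)

        def cleanpunc(sentence):
            # strip punctuation marks and special characters
            return re.sub(r'[^A-Za-z0-9\s]', ' ', sentence)

        sno = SnowballStemmer('english')
        stop_not_included = set(stopwords.words('english'))  # 'not' is in the list, so it is removed from reviews
        stop_not_excluded = stop_not_included - {'not'}      # 'not' is taken out of the list, so it survives in reviews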
In [0]: str1 = None
        str2 = None
        final_string_not_included = []
        final_string_not_excluded = []
        s = ''
        for sent in final['Text'].values:
            filtered_sentence_not_included = []
            filtered_sentence_not_excluded = []
            sent = cleanhtml(sent)  # remove HTML tags
            for w in sent.split():
                for cleaned_words in cleanpunc(w).split():
                    # keep only alphabetic words of at least 3 letters
                    if cleaned_words.isalpha() and len(cleaned_words) > 2:
                        if cleaned_words.lower() not in stop_not_included:
                            s = (sno.stem(cleaned_words.lower())).encode('utf8')
                            filtered_sentence_not_included.append(s)
                        if cleaned_words.lower() not in stop_not_excluded:
                            s = (sno.stem(cleaned_words.lower())).encode('utf8')
                            filtered_sentence_not_excluded.append(s)
                    else:
                        continue
            str1 = b" ".join(filtered_sentence_not_included)  # cleaned review with 'not' removed as a stopword
            str2 = b" ".join(filtered_sentence_not_excluded)  # cleaned review with 'not' retained
            final_string_not_included.append(str1)
            final_string_not_excluded.append(str2)
        # attach the cleaned variants as columns (assignment reconstructed; they are used below)
        final['Text_not_included'] = final_string_not_included
        final['Text_not_excluded'] = final_string_not_excluded
In [0]: final.head()['Text_not_included']
Out[0]: 22620 b'dog love chicken product china wont buy anym...
22621 b'dog love saw pet store tag attach regard mad...
2547 b'use victor fli bait season cant beat great p...
2546 b'product avail www amazon com victor trap unr...
1145 b'receiv shipment could hard wait tri product ...
Name: Text_not_included, dtype: object
3 Bag of words
In [0]: # bi-gram, tri-gram and n-gram
        # Removing stopwords like "not" should be avoided before building n-grams
        count_vect = CountVectorizer(ngram_range=(2,2))  # scikit-learn
        final_bigram_counts_all_included = count_vect.fit_transform(final['Text'].values)
        final_bigram_counts_not_included = count_vect.fit_transform(final['Text_not_included'].values)
        final_bigram_counts_not_excluded = count_vect.fit_transform(final['Text_not_excluded'].values)
In [0]: arr=final_bigram_counts_all_included.getrow(6).toarray()
np.where(arr != 0)
In [0]: arr[0,358736]
Out[0]: 1
In [0]: num_negative=len(final[final['Score']=='negative'].values)
num_positive=len(final[final['Score']=='positive'].values)
In [9]: num_negative
Out[9]: 1500
In [10]: num_positive
Out[10]: 1500
In [12]: final_bigram_counts_not_excluded.get_shape()
3.1 Observation
Keep the perplexity p below 1500, i.e., at most half of the 3000 points being embedded.
In [0]: p=100
In [0]: model_all_included = TSNE(n_components=2, random_state=0, perplexity=p)
        model_not_included = TSNE(n_components=2, random_state=0, perplexity=p)
        model_not_excluded = TSNE(n_components=2, random_state=0, perplexity=p)
        tsne_data_all_included = model_all_included.fit_transform(final_bigram_counts_all_included.toarray())
        tsne_data_not_included = model_not_included.fit_transform(final_bigram_counts_not_included.toarray())
        tsne_data_not_excluded = model_not_excluded.fit_transform(final_bigram_counts_not_excluded.toarray())
In [65]: tsne_data_not_excluded
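The plot_all_included, plot_not_included and plot_not_excluded frames used by the scatter plots are never constructed in this export; a minimal sketch, pairing each 2-D embedding with the polarity labels (the construction is an assumption, consistent with the (3000, 3) shape and the f1/f2/label columns shown below):
In [0]: # Assumption: one (f1, f2, label) frame per embedding
        labels = final['Score'].values
        plot_all_included = pd.DataFrame(tsne_data_all_included, columns=['f1', 'f2'])
        plot_all_included['label'] = labels
        plot_not_included = pd.DataFrame(tsne_data_not_included, columns=['f1', 'f2'])
        plot_not_included['label'] = labels
        plot_not_excluded = pd.DataFrame(tsne_data_not_excluded, columns=['f1', 'f2'])
        plot_not_excluded['label'] = labels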
In [77]: plot_not_excluded.shape
Out[77]: (3000, 3)
In [78]: plot_all_included[:3]
Out[78]: f1 f2 label
0 -1.07917 0.52654 negative
1 1.16047 0.547074 negative
2 -1.03507 -1.26618 negative
In [18]: sns.FacetGrid(plot_not_included, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend()
In [19]: sns.FacetGrid(plot_not_excluded, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend()
In [23]: sns.FacetGrid(plot_all_included, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend()
In [24]: sns.FacetGrid(plot_not_included, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend()
In [25]: sns.FacetGrid(plot_not_excluded, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend()
4 TF-IDF
In [0]: from nltk.stem import SnowballStemmer
        sno = SnowballStemmer('english')
        # Stem every review without removing stopwords
        # (the start of this cell was lost in the export; reconstructed from the surviving fragment)
        text_stemmed = []
        for review in final['Text'].values:
            stemmed_review = b' '.join(sno.stem(w.lower()).encode('utf8')
                                       for w in cleanpunc(cleanhtml(review)).split()
                                       if w.isalpha() and len(w) > 2)
            text_stemmed.append(stemmed_review)
        final['stemmed_text'] = text_stemmed
In [41]: final['Text'][1]
Out[41]: 'Our dogs just love them. I saw them in a pet store and a tag was attached regarding t
In [42]: final['stemmed_text'][1]
Out[42]: b'our dog just love them saw them pet store and tag was attach regard them be made chin
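The cell that builds the all-words TF-IDF matrix referenced below (final_tf_idf_all_words_included) is missing from this export; a minimal sketch, assuming bigrams over the stemmed text, mirroring the two vectorizers that follow:
In [0]: # Assumption: bigram tf-idf with every word retained
        tf_idf_vect_all_words = TfidfVectorizer(ngram_range=(2,2))
        final_tf_idf_all_words_included = tf_idf_vect_all_words.fit_transform(final['stemmed_text'].values)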
In [0]: tf_idf_vect_all_stopwords_removed = TfidfVectorizer(ngram_range=(2,2))
        final_tf_idf_not_excluded = tf_idf_vect_all_stopwords_removed.fit_transform(final['Text_not_excluded'].values)
        tf_idf_vect_not_retained = TfidfVectorizer(ngram_range=(2,2))
        final_tf_idf_not_included = tf_idf_vect_not_retained.fit_transform(final['Text_not_included'].values)
        tsne_data_all_included = model_all_included.fit_transform(final_tf_idf_all_words_included.toarray())
        tsne_data_not_included = model_not_included.fit_transform(final_tf_idf_not_included.toarray())
        tsne_data_not_excluded = model_not_excluded.fit_transform(final_tf_idf_not_excluded.toarray())
In [59]: sns.FacetGrid(plot_not_included, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend()
In [61]: sns.FacetGrid(plot_not_excluded, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend()
In [65]: sns.FacetGrid(plot_all_included, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend()
In [66]: sns.FacetGrid(plot_not_included, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend()
In [67]: sns.FacetGrid(plot_not_excluded, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend()
5 Observations
TF-IDF provides better separation of negative and positive reviews as the perplexity increases.
The best results are obtained when all words are kept while forming bigrams, compared with
bigrams built after stopword removal. Excluding 'not' from the stopword list also gives better
results than treating 'not' as a stopword.
In [88]: sns.FacetGrid(plot_not_included, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend()
In [89]: sns.FacetGrid(plot_not_excluded, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend()
6 Conclusion
The best separation is obtained with perplexity 1250 and 1500 iterations.
7 Average Word2Vec
In [0]: # Tokenize each review for word2vec
        # (the start of this cell was lost in the export; reconstructed from the surviving fragment)
        list_of_sent = []
        for sent in final['Text'].values:
            filtered_sentence = []
            for w in cleanhtml(sent).split():
                for cleaned_words in cleanpunc(w).split():
                    if cleaned_words.isalpha():
                        filtered_sentence.append(cleaned_words.lower())
                    else:
                        continue
            list_of_sent.append(filtered_sentence)
In [24]: clean_text=[]
for sent in list_of_sent:
s=' '.join(sent)
clean_text.append(s)
In [28]: final['clean_text']=clean_text
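The cell that trains word2vec and builds the sent_vectors list used below did not survive the export; a minimal sketch, assuming gensim's Word2Vec with 50-dimensional vectors (the dimensionality the TF-IDF Word2Vec section relies on; min_count and the pre-gensim-4 size= keyword are assumptions):
In [0]: from gensim.models import Word2Vec
        # Train word2vec on the tokenized reviews (size= is the pre-gensim-4 keyword)
        w2v_model = Word2Vec(list_of_sent, min_count=5, size=50, workers=4)
        # Average word2vec: mean of the vectors of the in-vocabulary words of each review
        sent_vectors = []
        for sent in list_of_sent:
            sent_vec = np.zeros(50)
            cnt_words = 0
            for word in sent:
                if word in w2v_model.wv:
                    sent_vec += w2v_model.wv[word]
                    cnt_words += 1
            if cnt_words != 0:
                sent_vec /= cnt_words
            sent_vectors.append(sent_vec)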
In [51]: scaler=StandardScaler()
In [134]: np.shape(sent_vectors)
In [0]: scaled=scaler.fit_transform(sent_vectors)
tsne_model=TSNE(perplexity=900, n_iter=1500)
tsne_data=tsne_model.fit_transform(scaled)
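plot_tsne is not constructed anywhere in this export; a minimal sketch, pairing the embedding with the labels as in the earlier sections (an assumption):
In [0]: # Assumption: frame for plotting, same pattern as before
        plot_tsne = pd.DataFrame(tsne_data, columns=['f1', 'f2'])
        plot_tsne['label'] = final['Score'].values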
In [141]: sns.FacetGrid(plot_tsne, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend()
8 Conclusion
Average Word2Vec does not yield a good separation of the two classes.
9 TF-IDF Word2Vec
In [29]: tf_idf_vect = TfidfVectorizer(ngram_range=(1,1))
         final_tf_idf = tf_idf_vect.fit_transform(final['clean_text'].values)
         tfidf_feat = tf_idf_vect.get_feature_names()  # vocabulary, used to look up a word's tf-idf column
         tfidf_sent_vectors = []  # the tfidf-weighted w2v vector of each review is stored in this list
         row = 0
         cnt = 0
         for sent in list_of_sent:  # for each review
             sent_vec = np.zeros(50)  # word vectors are 50-dimensional
             weight_sum = 0  # sum of the tf-idf weights of the valid words in the review
             for word in sent:  # for each word in the review
                 try:
                     vec = w2v_model.wv[word]
                     # look up the tf-idf weight of this word in this review
                     tfidf = final_tf_idf[row, tfidf_feat.index(word)]
                     sent_vec += (vec * tfidf)
                     weight_sum += tfidf
                     cnt += 1
                 except Exception:
                     pass
             if weight_sum != 0:
                 sent_vec /= weight_sum
             tfidf_sent_vectors.append(sent_vec)
             row += 1
In [48]: tfidf_sent_vectors[1500]
In [82]: p=700
In [83]: scaled=scaler.fit_transform(tfidf_sent_vectors)
model=TSNE(perplexity=p, n_iter=1500)
In [84]: data=model.fit_transform(scaled)
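The plot frame used by the scatter calls below is likewise missing; a minimal sketch (an assumption, same pattern as above):
In [0]: # Assumption: frame for plotting the tf-idf w2v embedding
        plot = pd.DataFrame(data, columns=['f1', 'f2'])
        plot['label'] = final['Score'].values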
In [75]: sns.FacetGrid(plot, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend() # p=
9.0.1 MinMaxScaled data embedded in 2 dimensions with p = 1000
In [81]: sns.FacetGrid(plot, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend() # p=1000
In [86]: sns.FacetGrid(plot, hue="label", size=6).map(plt.scatter, 'f1', 'f2').add_legend() # p=
10 Conclusion
Of the four representations, TF-IDF yields the clearest separation of the two classes, making it the most promising basis for classification.