Step 1: Finding the data set: "amazon_reviews_multilingual_UK_v1_00.tsv.gz" (opened with gzip in 'rt' mode, "utf8" encoding)
In [1]:
import gzip
# Relative path of the gzipped, tab-separated review file.
path = "amazon_reviews_multilingual_UK_v1_00.tsv.gz"
# Open in text mode ('rt') with UTF-8 decoding so that reading yields str lines.
# The handle is left open on purpose: the following cells read from it.
f = gzip.open(filename=path, mode='rt', encoding="utf8")
In [2]:
# The first line of the file carries the tab-separated column names.
header = f.readline().strip().split('\t')
print(header)
In [3]:
# Accumulator for the parsed review records (one dict per data line).
dataset = list()
In [4]:
# Parse every remaining line of the TSV into a typed record and collect it.
for line in f:
    # Strip only the line terminator. A bare strip() would also remove leading
    # and trailing tabs, so an empty first/last column would shift or drop
    # fields when they are zipped with the header.
    fields = line.rstrip('\r\n').split('\t')
    d = dict(zip(header, fields))
    # Numeric columns arrive as text; convert them to int.
    for field in ['star_rating', 'helpful_votes', 'total_votes']:
        d[field] = int(d[field])
    # Flag columns are encoded as 'Y' / anything else; convert them to bool.
    for field in ['verified_purchase', 'vine']:
        d[field] = d[field] == 'Y'
    dataset.append(d)
# The file has been fully consumed; release the handle.
f.close()
localhost:8891/notebooks/course3project.ipynb# 1/4
9/24/2020 course3project - Jupyter Notebook
In [5]:
# Display a single parsed record (index 20) as a spot check of the parsing step.
dataset[20]
Out[5]:
{'marketplace': 'UK',
'customer_id': '20222',
'review_id': 'R3I6A1LWUUVBRE',
'product_id': 'B0002CVQCW',
'product_parent': '281008695',
'product_title': "Les Miserables 10th Anniversary Concert At The Royal Albe
rt Hall (2 Disc Collector's Edition) [DVD]",
'product_category': 'Video DVD',
'star_rating': 5,
'helpful_votes': 0,
'total_votes': 0,
'vine': False,
'verified_purchase': True,
'review_headline': 'some of the best voices in the world',
'review_body': 'I liked it so much I bought it twice just so that I could s
hare it with a friend. Excellant',
'review_date': '2013-02-26'}
In [6]:
import random

# Fixed seed so the shuffle, and therefore the train/test split and every
# metric derived from it, is identical on each Restart & Run All.
SPLIT_SEED = 0
# A dedicated Random instance keeps the global random state untouched.
# Note: this shuffles `dataset` in place, exactly as before.
random.Random(SPLIT_SEED).shuffle(dataset)

N = len(dataset)
split_at = 4 * N // 5  # first 80% for training, remaining 20% for testing
trainingSet = dataset[:split_at]
testingSet = dataset[split_at:]
localhost:8891/notebooks/course3project.ipynb# 2/4
9/24/2020 course3project - Jupyter Notebook
In [7]:
# Defining the feature function and the implementation will be based on star rating and len
from collections import defaultdict
from nltk.stem.porter import PorterStemmer
import string
# Count how often each stemmed word occurs across the training reviews.
wordCount = defaultdict(int)
stemmer = PorterStemmer()  # use stemmer.stem(word)
# Translation table that deletes every character in string.punctuation.
strip_punctuation = str.maketrans('', '', string.punctuation)
for d in trainingSet:
    # Named `text` rather than `f`: `f` is the data-file handle created in the
    # loading cell, and rebinding it here would silently clobber that name.
    text = d['review_body'].lower().translate(strip_punctuation)
    for w in text.split():
        wordCount[stemmer.stem(w)] += 1  # with stemming
def feature(dat):
    """Build the feature vector for a single review record.

    Parameters
    ----------
    dat : dict
        A review record; must contain the keys 'star_rating' and
        'review_body'.

    Returns
    -------
    list
        [1, star rating, length of the review body in characters].
        The leading 1 is the constant (offset) term.
    """
    # Use the length of THIS review. The previous len(wordCount) was the size
    # of the global vocabulary, i.e. the same constant for every record, so it
    # carried no information about the individual review.
    return [1, dat['star_rating'], len(dat['review_body'])]
In [8]:
# NOTE(review): `preprocessing`, `X_train` and `X_test` are not defined in any
# cell visible in this notebook export -- presumably sklearn's `preprocessing`
# module and feature matrices built with feature(); confirm against the
# original notebook. The recorded output of this cell also suggests that a
# model-fitting statement is missing from this export -- TODO confirm.
# Fit the scaler on the training matrix only, then apply that same fitted
# transform to both the training and the test matrix.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
Out[8]:
LogisticRegression()
localhost:8891/notebooks/course3project.ipynb# 3/4
9/24/2020 course3project - Jupyter Notebook
In [9]:
TP_train = 1041287
FP_train = 324708
TN_train = 0
FN_train = 0
TF_Accuracy: 76.23%
BER_train = 0.5
In [ ]:
localhost:8891/notebooks/course3project.ipynb# 4/4