Workshop - NLP - Ipynb - Colaboratory
Workshop - NLP - Ipynb - Colaboratory
device(type='cuda')
# Load the training split. The file is ';'-separated and the column names
# ('text', 'label') are supplied here rather than read from the file.
df = pd.read_csv(
    "/content/train.txt",
    delimiter=';',
    names=['text', 'label'],
)
df
text label
17997 i feel its important to share this info for th... joy
# Class balance of the raw emotion labels. The column is passed as the
# keyword `x`: the positional form is what triggers the seaborn FutureWarning
# shown in this notebook's output.
sns.countplot(x=df['label'])
17998 i truly feel that if you are passionate enough... joy
# sns.countplot(df.label)
17999 i feel like i just wanna buy any cute make up ... joy
/usr/local/lib/python3.7/dist-packages/seaborn/_decorators.py:43: FutureWarning: Pass
18000 rows × 2 columns
FutureWarning
<matplotlib.axes._subplots.AxesSubplot at 0x7f5baa2f20d0>
# List the distinct label values present in the training data.
df['label'].unique()
# def custom_encoder(df):
# df = df.replace(to_replace ="surprise", value =1)
# df = df.replace(to_replace ="love", value =1)
# df = df.replace(to_replace ="joy", value =1)
# df = df.replace(to_replace ="fear", value =0)
# df = df.replace(to_replace ="anger", value =0)
# df = df.replace(to_replace ="sadness", value =0)
# return df
# df['label'] = custom_encoder(df['label'])
def custom_encoder(df):
    """Collapse the six emotion labels into binary sentiment, in place.

    "surprise", "love" and "joy" become 1; "fear", "anger" and "sadness"
    become 0. Values that are not one of these six labels are left untouched.

    Args:
        df: The pandas object holding the labels (the callers in this file
            pass a label Series). It is modified in place.

    Returns:
        The same object that was passed in, so that callers which assign the
        result (``encoded = custom_encoder(labels)``) no longer receive None.
    """
    label_to_sentiment = {
        "surprise": 1,
        "love": 1,
        "joy": 1,
        "fear": 0,
        "anger": 0,
        "sadness": 0,
    }
    # One replace per label, exactly as before, but driven by a single table
    # instead of six copy-pasted calls.
    for label, sentiment in label_to_sentiment.items():
        df.replace(to_replace=label, value=sentiment, inplace=True)
    return df
# Encode the training labels to 0/1 in place.
# NOTE(review): this relies on df['label'] writing through to df; with pandas
# Copy-on-Write enabled the in-place replace would not reach df. Confirm
# against the pandas version in use.
custom_encoder(df['label'])
# Class balance after encoding. `x=` avoids seaborn's positional-argument
# FutureWarning seen in this notebook's output.
sns.countplot(x=df.label)
# Shared text-cleaning resources used by text_transformation():
# a WordNet lemmatizer instance ...
lm = WordNetLemmatizer()
# ... and the English stop-word list, held as a set for membership tests.
stops = set(stopwords.words('english'))
# stops
def text_transformation(df_col):
    """Clean every entry of ``df_col`` and return the cleaned strings.

    Each item is converted to ``str``, every character outside ``a-z``/``A-Z``
    is replaced by a space, the text is lower-cased and split on whitespace,
    stop words (module-level ``stops``) are dropped and the remaining words
    are lemmatized with the module-level ``lm``.

    Args:
        df_col: Iterable of documents; the callers in this file pass a text
            column or a list of cleaned inputs.

    Returns:
        A list with one space-joined string of lemmatized words per item.
    """

    def _clean(document):
        # Letters only: digits, punctuation and symbols all become spaces.
        letters_only = re.sub('[^a-zA-Z]', ' ', str(document))
        tokens = letters_only.lower().split()
        kept = [lm.lemmatize(token) for token in tokens if token not in stops]
        return ' '.join(str(token) for token in kept)

    return [_clean(document) for document in df_col]
# Clean the training text, then vectorise it as unigram + bigram counts.
corpus = text_transformation(df['text'])
cv = CountVectorizer(ngram_range=(1,2))
traindata = cv.fit_transform(corpus)
X = traindata
y = df.label
# max_features='sqrt' is what 'auto' meant for classifiers. 'auto' was
# deprecated in scikit-learn 1.1 and later removed, so naming the value
# explicitly keeps this cell working on current releases with the same model.
rfc = RandomForestClassifier(max_features='sqrt',
                             max_depth=None,
                             n_estimators=500,
                             min_samples_split=5,
                             min_samples_leaf=1)
rfc.fit(X,y)
RandomForestClassifier(min_samples_split=5, n_estimators=500)
# Load the held-out split (same ';'-separated layout as the training file).
test_df = pd.read_csv('/content/test.txt',delimiter=';',names=['text','label'])
X_test,y_test = test_df.text,test_df.label
# Encode the labels into two classes, 0 and 1. custom_encoder works in place
# on y_test; its result is deliberately not assigned, because the original
# `test_df = custom_encoder(y_test)` replaced the test frame with None.
custom_encoder(y_test)
# Pre-processing of text
test_corpus = text_transformation(X_test)
# Convert text data into vectors with the vocabulary fitted on the training set
testdata = cv.transform(test_corpus)
# Predict the target
predictions = rfc.predict(testdata)
# NOTE(review): plot_confusion_matrix is imported/defined outside this view;
# confirm it accepts (y_true, y_pred) - scikit-learn's former helper of the
# same name expected (estimator, X, y_true) instead.
plot_confusion_matrix(y_test,predictions)
acc_score = accuracy_score(y_test,predictions)
pre_score = precision_score(y_test,predictions)
rec_score = recall_score(y_test,predictions)
print('Accuracy_score: ',acc_score)
print('Precision_score: ',pre_score)
print('Recall_score: ',rec_score)
print("-"*50)
cr = classification_report(y_test,predictions)
print(cr)
Accuracy_score: 0.9615
Precision_score: 0.9616648411829135
Recall_score: 0.9543478260869566
--------------------------------------------------
precision recall f1-score support
def expression_check(prediction_input):
    """Print a human-readable sentiment message for one predicted label.

    Args:
        prediction_input: The predicted class; 0 is reported as negative,
            1 as positive, anything else as invalid.
    """
    if prediction_input == 0:
        message = "Input statement has Negative Sentiment."
    elif prediction_input == 1:
        message = "Input statement has Positive Sentiment."
    else:
        message = "Invalid Statement."
    print(message)
# function to take the input statement and perform the same transformations we did earlier
def sentiment_predictor(input):
    """Predict and print the sentiment of one or more statements.

    The statements go through the same pipeline used for training:
    ``text_transformation`` for cleaning, the fitted ``cv`` vectorizer, then
    the trained ``rfc`` classifier. One message is printed per statement via
    ``expression_check``.

    Args:
        input: A single statement (``str``) or an iterable of statements.
    """
    # text_transformation iterates over its argument, so a bare string would
    # be processed one character at a time; wrap it as a single document.
    if isinstance(input, str):
        input = [input]
    input = text_transformation(input)
    transformed_input = cv.transform(input)
    prediction = rfc.predict(transformed_input)
    # Report each prediction separately: comparing a multi-element prediction
    # array inside expression_check's `if` is ambiguous and raises.
    for predicted_label in prediction:
        expression_check(predicted_label)
# Run the trained model on two sample inputs and print their sentiment.
# NOTE(review): input1 and input2 are defined outside this view; presumably
# each is a list containing one statement - confirm.
sentiment_predictor(input1)
sentiment_predictor(input2)