Machine Learning NLP Lab — Sayak Mallick
1. Sentiment Analysis
import re
from matplotlib import rcParams
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from nltk.corpus import stopwords
from wordcloud import WordCloud
# Load the labelled train/validation splits. Each line of the files is
# "text;label", so a semicolon delimiter with explicit column names parses
# them into two-column frames.
df_train = pd.read_csv("train.txt", delimiter=';', names=['text', 'label'])
df_val = pd.read_csv("val.txt", delimiter=';', names=['text', 'label'])
print(df_val)
# Merge both splits into a single frame and renumber the rows so the
# concatenated index is contiguous.
df = pd.concat([df_train, df_val])
df.reset_index(inplace=True, drop=True)
print(df)
print("Shape of the Data frame: ", df.shape)
print(df.sample(5))
# Plot the class distribution. Pass the column by name: positional Series
# arguments to countplot were deprecated and then removed in seaborn >= 0.12,
# so `sns.countplot(df.label, data=df)` breaks on current versions.
sns.countplot(x='label', data=df)
plt.show()
def custom_encoder(df):
    """Binary-encode the emotion labels in *df* in place.

    Positive emotions (surprise, love, joy) become 1; negative emotions
    (fear, anger, sadness) become 0. The frame is mutated in place and
    nothing is returned, matching the original call sites.
    """
    # One dict-based replace instead of six separate passes over the frame.
    mapping = {
        "surprise": 1,
        "love": 1,
        "joy": 1,
        "fear": 0,
        "anger": 0,
        "sadness": 0,
    }
    df.replace(to_replace=mapping, inplace=True)
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pylab import rcParams
lm = WordNetLemmatizer()

def text_transformation(df_col):
    """Clean and lemmatize every document in the iterable *df_col*.

    Each item is reduced to alphabetic characters, lowercased, split into
    tokens, filtered against the English stopword list, and lemmatized.

    Returns a list of cleaned, space-joined document strings.
    """
    # Build the stopword set ONCE. The original re-evaluated
    # set(stopwords.words('english')) for every single token of every
    # document, which dominated the runtime for no benefit.
    stop_words = set(stopwords.words('english'))
    corpus = []
    for item in df_col:
        # Keep letters only, then normalise case and tokenize on whitespace.
        cleaned = re.sub('[^a-zA-Z]', ' ', str(item))
        tokens = cleaned.lower().split()
        kept = [lm.lemmatize(word) for word in tokens if word not in stop_words]
        # lemmatize() already returns str, so no extra str() wrapping needed.
        corpus.append(' '.join(kept))
    return corpus
corpus = text_transformation(df['text'])
rcParams['figure.figsize'] = 20, 8
# Join the cleaned documents into one space-separated string for the word
# cloud. (The original looped `for word in row`, which iterates a string
# character by character and glued all letters together with no separators,
# destroying the word boundaries the cloud needs for its frequency counts.)
word_cloud = " ".join(corpus)
wordcloud = WordCloud(width=1000, height=500,
                      background_color='white',
                      min_font_size=10).generate(word_cloud)
plt.imshow(wordcloud)
2. ngram program
from nltk import ngrams
import numpy
def remove(string):
    """Return *string* with every space character removed."""
    return "".join(string.split(" "))
vocab = "Today is a good day to learn natural language proccesing"
print("Sample Document - ", vocab)
# constructing the lexicon: the individual words of the sample document
lex = vocab.split(" ")
lex
# Pad the first word with '$' boundary markers and space out its characters
# so that ngrams() sees one token per character plus the two sentinels.
spaced = "$  " + " ".join(lex[0]) + "  $"
n = 3
ngrams_ = ngrams(spaced.split(), n)
# Collapse each character trigram tuple back into a single string.
ngram_list = [''.join(gram) for gram in ngrams_]
ngram_list
# Repeat the character-trigram extraction for EVERY word in the lexicon,
# collecting one list of trigrams per word.
ngram_list = []
n = 3
for word in lex:
    # '$' sentinels mark the word boundaries; spacing the characters out
    # lets ngrams() treat each character as a token.
    spaced = "$  " + " ".join(word) + "  $"
    ngrams_ = ngrams(spaced.split(), n)
    ngram_list.append([''.join(gram) for gram in ngrams_])
ngram_list