Soundarya 256 NLP Practs
Soundarya 256 NLP Practs
PRACTICAL NO: 1
AIM:
Write a program to implement sentence segmentation and word tokenization
THEORY:
CODES:
# Practical 1: sentence segmentation and word tokenization.
import nltk
from nltk.tokenize import word_tokenize

# Read the sample document; 'with' guarantees the handle is closed.
# (Original listing lost the indentation of the with-body.)
with open('soundarya.txt') as f:
    lines = f.readlines()
1
NAME: SOUNDARYA NATURAL LANGUAGE PROCESSING JOURNAL ROLL NO: 246
TEXT DOCUMENT:
OUTPUT:
2
NAME: SOUNDARYA NATURAL LANGUAGE PROCESSING JOURNAL ROLL NO: 246
PRACTICAL NO: 2
AIM:
Write a program to implement stemming and lemmatization.
THEORY:
3
NAME: SOUNDARYA NATURAL LANGUAGE PROCESSING JOURNAL ROLL NO: 246
A] Stemming:
CODES:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import LancasterStemmer

# Sample words: several inflections of "run" plus two adverbs,
# chosen to show how aggressively each stemmer truncates.
words = ['run', 'runner', 'running', 'ran', 'runs', 'easily', 'fairly']

def portstemming(words):
    """Print the Porter-stemmed form of each word in *words*."""
    ps = PorterStemmer()
    print("Porter Stemmer")
    for word in words:
        print(word, "--->", ps.stem(word))

def snowballstemming(words):
    """Print the Snowball (Porter2)-stemmed form of each word in *words*."""
    snowball = SnowballStemmer(language='english')
    print("Snowball Stemmer")
    for word in words:
        print(word, "--->", snowball.stem(word))

def lancasterstemming(words):
    """Print the Lancaster-stemmed form of each word in *words*."""
    lancaster = LancasterStemmer()
    print("Lancaster Stemmer")
    for word in words:
        print(word, "--->", lancaster.stem(word))
4
NAME: SOUNDARYA NATURAL LANGUAGE PROCESSING JOURNAL ROLL NO: 246
# Simple interactive menu dispatching to one of the three stemmers.
print("Select operation.")
print("1.Porter Stemmer")
print("2.Snowball Stemmer")
print("3.Lancaster Stemmer")
while True:
    choice = input("Enter choice(1/2/3): ")
    if choice in ('1', '2', '3'):
        # The stemming functions print their own output and return None,
        # so call them directly (the original wrapped them in print(),
        # which appended a spurious "None" line).
        if choice == '1':
            portstemming(words)
        elif choice == '2':
            snowballstemming(words)
        elif choice == '3':
            lancasterstemming(words)
        break  # a valid choice was handled; stop re-prompting
    else:
        print("Invalid Input")
5
NAME: SOUNDARYA NATURAL LANGUAGE PROCESSING JOURNAL ROLL NO: 246
OUTPUT:
6
NAME: SOUNDARYA NATURAL LANGUAGE PROCESSING JOURNAL ROLL NO: 246
B] Lemmatization:
CODES:
# Practical 2B: lemmatization with WordNet.
import nltk
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
# Default lemmatize() treats each token as a noun (no POS tag is passed).
for w in tokenization:
    print("Lemma for {} is {}".format(w, wordnet_lemmatizer.lemmatize(w)))
OUTPUT:
7
NAME: SOUNDARYA NATURAL LANGUAGE PROCESSING JOURNAL ROLL NO: 246
PRACTICAL NO: 3
AIM:
Write a program to implement a tri-gram model.
THEORY:
CODES:
# Practical 3: n-gram models — corpus loading.
import nltk
from nltk.corpus import inaugural
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize

# Use a context manager so the file is always closed
# (the original opened it and never called close()).
with open("corpus.txt", "r") as f:
    corpus = f.read()
8
NAME: SOUNDARYA NATURAL LANGUAGE PROCESSING JOURNAL ROLL NO: 246
# Corpus statistics.
# NOTE(review): `words` and `sents` are built on a page not shown here
# (presumably word_tokenize(corpus) / sent_tokenize(corpus)) — confirm.
average_tokens = round(len(words)/len(sents))
print("The average number of tokens per sentence is",average_tokens)
unique_tokens = set(words)
print("The number of unique tokens are", len(unique_tokens))
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# Keep only non-stopword tokens (comprehension replaces the loop-and-append).
final_tokens = [each for each in words if each not in stop_words]
print("The number of total tokens after removing stopwords are", len((final_tokens)))
9
NAME: SOUNDARYA NATURAL LANGUAGE PROCESSING JOURNAL ROLL NO: 246
# Report the five most frequent n-grams of each order.
# NOTE(review): freq_bi / freq_tri / freq_four are FreqDist objects built
# on a page not shown here — confirm.
# PDF extraction had garbled the "\n" escapes below into "In".
print("Most common n-grams without stopword removal and without add-1 smoothing: \n")
print("Most common bigrams:", freq_bi.most_common(5))
print("\nMost common trigrams:", freq_tri.most_common(5))
print("\nMost common fourgrams:", freq_four.most_common(5))
10
NAME: SOUNDARYA NATURAL LANGUAGE PROCESSING JOURNAL ROLL NO: 246
# Build the trailing n-grams (n = 1..3) of the two input strings.
# NOTE(review): str1 / str2 are defined on a page not shown here — confirm.
token_1 = word_tokenize(str1)
token_2 = word_tokenize(str2)
ngram_1 = {1:[], 2:[], 3:[]} #to store the n-grams formed
ngram_2 = {1:[], 2:[], 3:[]}
for i in range(3):
    # [-1] keeps only the LAST n-gram of each order — presumably the
    # context used for next-word prediction; verify against the original.
    ngram_1[i+1] = list(ngrams(token_1, i+1))[-1]
    ngram_2[i+1] = list(ngrams(token_2, i+1))[-1]
print("String 1: ", ngram_1,"\nString 2: ",ngram_2)
11
NAME: SOUNDARYA NATURAL LANGUAGE PROCESSING JOURNAL ROLL NO: 246
# NOTE(review): this listing is truncated by a page break — the loop
# headers that populate ngrams_all / ngrams_voc / ngrams_prob / pred_1,
# the indentation, and the body of the final `if` are on pages not
# shown here. Code kept byte-identical; comments only.
# Count of all n-grams and of the vocabulary, per order i+1.
total_ngrams[i+1] = len(ngrams_all[i+1])
total_voc[i+1] = len(ngrams_voc[i+1])
for i in range(4):
for ngram in ngrams_prob[i+1]:
ngram[-1] = (ngram[-1]+1)/(total_ngrams[i+1]+total_voc[i+1]) #add-1 smoothing
# Sort each order's n-grams by smoothed probability, highest first.
for i in range(4):
ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
# Collect up to 5 predicted next words per order (enclosing loop not shown).
count +=1
pred_1[i+1].append(each[0][-1])
if count ==5:
break
if count<5:
12
NAME: SOUNDARYA NATURAL LANGUAGE PROCESSING JOURNAL ROLL NO: 246
# Pad the prediction list to exactly 5 entries, then print the
# next-word predictions for both strings.
# NOTE(review): this continues an `if count < 5:` on the previous page;
# the string literals below were split across physical lines by PDF
# extraction and have been rejoined — confirm against the original.
while count != 5:
    pred_1[i+1].append("NOT FOUND")
    #if no word prediction is found, replace with NOT FOUND
    count += 1
for i in range(4):
    ngrams_prob[i+1] = sorted(ngrams_prob[i+1], key = lambda x:x[1], reverse = True)
print("Next word predictions for the strings using the probability models of bigrams, trigrams, and fourgrams\n")
print(str1)
print("Bigram model predictions: {}\nTrigram model predictions: {}\nFourgram model predictions: {}\n".format(pred_1[1], pred_1[2], pred_1[3]))
print(str2)
print("Bigram model predictions: {}\nTrigram model predictions: {}\nFourgram model predictions: {}".format(pred_2[1], pred_2[2], pred_2[3]))
13
NAME: SOUNDARYA NATURAL LANGUAGE PROCESSING JOURNAL ROLL NO: 246
OUTPUT:
14