Shubham Jade
MSc IT
31031420010
NLP Practical Journal
Index
8 Word Similarity
Stemming Example 1
from nltk.stem import PorterStemmer
ps = PorterStemmer()
text4 = "I am a Student of Somaiya University.".split()
print(text4)
for w in text4:
    rootWord = ps.stem(w)
    print(rootWord)
Stemming Example 2
words=["Unexpected", "disagreement", "disagree", "agreement",
"quirkiness", "historical", "canonical", "happiness", "unkind",
"dogs", "expected"]
for w in words:
    stemPrint = ps.stem(w)
    print(w, " -Stem- ", stemPrint)
Lemmatization Example 1
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
text5 = "I am Studying in Part 2."
tokenization = nltk.word_tokenize(text5)
for w in tokenization:
    print("Lemma for {} is {}".format(w, wordnet_lemmatizer.lemmatize(w)))
Lemmatization Example 2
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
words2=["Unexpected", "disagreement", "disagree", "agreement",
"quirkiness", "historical", "canonical", "happiness", "unkind",
"dogs", "expected", “studies”,”cries”,”applies”]
for w in words2:
print("Lemma for {} is {}".format(w,
wordnet_lemmatizer.lemmatize(w)))
Practical 2
Implement a Python program that splits sentences into words and displays both the split words and the count of words in the given sentences using the tokenizer function.
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag_sents
#Sample input text (placeholder); first split it into sentence tokens
text_sentence_tokens = sent_tokenize("NLTK makes text processing easy. It ships with many tokenizers.")
text_word_tokens = []
for sentence_token in text_sentence_tokens:
    text_word_tokens.append(word_tokenize(sentence_token))
print(text_word_tokens)
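The practical also asks to display the count of words; a minimal sketch continuing from text_word_tokens above (the variable names are only illustrative):
#Word count per sentence and in total
for i, sent_words in enumerate(text_word_tokens, 1):
    print("Sentence", i, "word count:", len(sent_words))
print("Total words:", sum(len(s) for s in text_word_tokens))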
#POS Tag Word Tokens
text_tagged = pos_tag_sents(text_word_tokens)
print (text_tagged)
#Default tagging
from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')
tagger.tag(['Hello', 'World'])
#Evaluating Accuracy
import nltk
nltk.download('treebank')
from nltk.corpus import treebank
test_sents = treebank.tagged_sents()[3000:]
tagger.evaluate(test_sents)
#Tagging Sentence
tagger.tag_sents([['Hello', 'world', '.'], ['How', 'are', 'you','?']])
#Untagging a tagged sentence
from nltk.tag import untag
untag([('Hello', 'NN'), ('World', 'NN')])
Ex 1
grammar1 = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP | V NP PP
PP -> P NP
V -> "saw" | "ate" | "walked"
NP -> "John" | "Mary" | "Bob" | Det N | Det N PP
Det -> "a" | "an" | "the" | "my"
N -> "man" | "dog" | "cat" | "telescope" | "park"
P -> "in" | "on" | "by" | "with"
""")
sent = "Mary saw Bob".split()
rd_parser = nltk.RecursiveDescentParser(grammar1)
for tree in rd_parser.parse(sent):
    print(tree)
Ex 2
import nltk
from nltk.parse import RecursiveDescentParser
Prod_rule=nltk.CFG.fromstring("""
S -> NP VP
NP -> N
NP -> Det N
VP -> V NP
VP -> V
N -> 'Person Name' | 'He' | 'She' | 'Boy' | 'Girl' | 'It' | 'cricket' |
'song' | 'book'
V -> 'likes' | 'reads' | 'sings'
""")
sent='He likes cricket'
sent1=sent.split()
sent1
parser = nltk.RecursiveDescentParser(Prod_rule)
parser
for t in parser.parse(sent1):
    print(t)
Ex 3
Simple_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
Det -> 'an' | 'my'
N -> 'elephant' | 'pajamas'
V -> 'shot'
P -> 'in'
""")
sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
sent
parser = nltk.ChartParser(Simple_grammar)
for tree in parser.parse(sent):
    print(tree)
Practical 5
Write Python code to find the term frequency and inverse document frequency for three documents. (Consider the 3 documents as 3 paragraphs.)
Ex 1
# Getting bigrams
from sklearn.feature_extraction.text import CountVectorizer
# txt1 is the list of input documents (the three paragraphs)
vectorizer = CountVectorizer(ngram_range=(2, 2))
X1 = vectorizer.fit_transform(txt1)
features = vectorizer.get_feature_names_out()   # get_feature_names() in older scikit-learn
print("\n\nX1 : \n", X1.toarray())
# Applying TFIDF
# You can still get n-grams here
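A minimal sketch of the TF-IDF step announced in the two comments above, assuming txt1 is the same list of documents passed to CountVectorizer:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))   # bigram TF-IDF
X2 = tfidf_vectorizer.fit_transform(txt1)
print("\n\nTF-IDF : \n", X2.toarray())
print(tfidf_vectorizer.get_feature_names_out())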
Ex 2
# Importing libraries
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
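Practical 5 asks for term frequency and inverse document frequency over three documents; a minimal sketch using the TfidfVectorizer and pandas imported above (the three paragraphs below are only placeholders):
# Three documents (placeholder paragraphs)
doc1 = "Natural language processing helps computers understand human language."
doc2 = "Term frequency counts how often a word appears in a single document."
doc3 = "Inverse document frequency lowers the weight of words common to all documents."
docs = [doc1, doc2, doc3]
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(docs)
df = pd.DataFrame(tfidf_matrix.toarray(),
                  columns=tfidf.get_feature_names_out(),
                  index=["doc1", "doc2", "doc3"])
print(df)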
#Sentence Generation
import itertools
from nltk.grammar import CFG
from nltk.parse import generate
demo_grammar = """
S -> NP VP
NP -> Det N
PP -> P NP
VP -> 'slept' | 'saw' NP | 'walked' PP
Det -> 'the' | 'a'
N -> 'man' | 'park' | 'dog'
P -> 'in' | 'with'
"""
grammar = CFG.fromstring(demo_grammar)
for n, sent in enumerate(generate.generate(grammar, n=10), 1):
    print('%3d. %s' % (n, ' '.join(sent)))
Ex 2
from nltk.grammar import Nonterminal
from nltk.grammar import toy_pcfg2
from nltk.probability import DictionaryProbDist
productions = toy_pcfg2.productions()
# Get all productions with LHS=NP
np_productions = toy_pcfg2.productions(Nonterminal('NP'))
prob_dict = {}
for pr in np_productions:
    prob_dict[pr.rhs()] = pr.prob()
np_probDist = DictionaryProbDist(prob_dict)
# Each time you call, you get a random sample
print(np_probDist.generate())   # e.g. (Det, N)
print(np_probDist.generate())   # e.g. (Name,)
print(np_probDist.generate())   # e.g. (Name,)
pcfg_generate(grammar) -- returns a tree sampled from the language described by the PCFG grammar.
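A minimal sketch of such a pcfg_generate, assuming the toy_pcfg2 grammar imported above; it samples one production per nonterminal with DictionaryProbDist and builds an nltk Tree:
from nltk import Tree
from nltk.grammar import Nonterminal
from nltk.probability import DictionaryProbDist

def pcfg_generate(grammar):
    # sample one production for a nonterminal according to the rule probabilities
    def sample_production(nonterminal):
        prods = grammar.productions(lhs=nonterminal)
        return DictionaryProbDist({p: p.prob() for p in prods}).generate()
    # recursively expand nonterminals into a tree; terminals are returned as-is
    def expand(symbol):
        if isinstance(symbol, Nonterminal):
            prod = sample_production(symbol)
            return Tree(symbol.symbol(), [expand(s) for s in prod.rhs()])
        return symbol
    return expand(grammar.start())

print(pcfg_generate(toy_pcfg2))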
Practical 8
Given two words, calculate the similarity between the words
a. By using path similarity.
b. By using Wu-Palmer Similarity.
#Synsets
from nltk.corpus import wordnet
syn1 = wordnet.synsets('hello')[0]
syn2 = wordnet.synsets('selling')[0]
print ("hello name : ", syn1.name())
print ("selling name : ", syn2.name())
a. By using path similarity.
ref = syn1.hypernyms()[0]
print("Self comparison : ", syn1.shortest_path_distance(ref))
print("Distance of hello from selling : ", syn1.shortest_path_distance(syn2))
print("Distance of selling from hello : ", syn2.shortest_path_distance(syn1))
b. By using Wu-Palmer Similarity.
syn1.wup_similarity(syn2)
Practical 9
Consider a sentence and do the following.
a. Import the libraries.
b. Then apply word tokenization and Part-Of-Speech tagging to the sentence.
c. Create a chunk parser and test it on the sentence.
d. Identify nationalities, religions or political groups, organizations, dates, and money in the given sentence.
(Select sentence appropriately)
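A minimal sketch of steps a-d, using NLTK's RegexpParser and ne_chunk; the sample sentence and the chunk grammar are my own assumptions, and the required NLTK models must be downloaded first:
# a. Import the libraries
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk, RegexpParser
# nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker'); nltk.download('words')

# Sample sentence (an assumption, chosen to contain several entity types)
sentence = ("The Indian cricket board paid $2 million to the ICC on 5 June 2023, "
            "a spokesman for the Hindu community in Mumbai said.")

# b. Word tokenization and Part-Of-Speech tagging
tokens = word_tokenize(sentence)
tagged = pos_tag(tokens)
print(tagged)

# c. A simple noun-phrase chunk parser tested on the tagged sentence
chunk_parser = RegexpParser("NP: {<DT>?<JJ>*<NN.*>+}")
print(chunk_parser.parse(tagged))

# d. Named entities; ne_chunk labels PERSON, ORGANIZATION, GPE, etc.
#    (date and money expressions are not always recognised by this chunker)
print(ne_chunk(tagged))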
import nltk
from nltk.corpus import wordnet
synonyms = []
antonyms = []
for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
print(set(synonyms))
print(set(antonyms))
5. Discern Hypernyms and Hyponyms in Synset.
#hypernym of synset
syn.hypernyms()
#Similar synsets
syn.hypernyms()[0].hyponyms()
#Tree path of synset
syn.hypernym_paths()
#POS of synset
syn.pos()
len(wordnet.synsets('great'))
len(wordnet.synsets('great', pos='n'))
len(wordnet.synsets('great', pos='a'))
f. Compare the similarity index of any two words
import nltk
from nltk.corpus import wordnet
# Let's compare the verb synsets of "run" and "sprint"
w1 = wordnet.synset('run.v.01') # v here denotes the tag verb
w2 = wordnet.synset('sprint.v.01')
print(w1.wup_similarity(w2))