SK NLP Practical (FS)
Somaiya College
Autonomous
CERTIFICATE
Certified that the experimental work entered in this journal is as per the syllabus in M.Sc. Information Technology for the NLP Practical prescribed by Somaiya University and was done by the student Name: Fardin Basuddin Shaikh, having Seat No: 31031422041, of class M.Sc. Information Technology during the academic year 2023-2024.
No. of Experiments completed: 10 out of 10
Practical No 1
Aim: Implement a Python program to generate the root words of the words in the given sentences.
Source Code:
pip install nltk

import nltk
nltk.download("all")  # downloads all NLTK corpora and models (large; only needed once)
PorterStemmer:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# create an object of class PorterStemmer
porter = PorterStemmer()
print(porter.stem("play"))
print(porter.stem("playing"))
print(porter.stem("plays"))
print(porter.stem("played"))

# stem every word of a sample sentence
sentence = "Programmers program with programming languages"
words = word_tokenize(sentence)
for w in words:
    print(w, " : ", porter.stem(w))
Output:
Snowball Stemmer:
#Snowball stemming algorithm
from nltk.stem.snowball import SnowballStemmer
snow = SnowballStemmer(language='english')
sentence = "Programmers coded with programming languages and using different framework and technologies"
words = word_tokenize(sentence)
for w in words:
    print(w, " : ", snow.stem(w))
Output:
Lancaster Stemmer:
from nltk.stem import LancasterStemmer
Lanc_stemmer = LancasterStemmer()
sentence = "Programmers program with programming languages"
words = word_tokenize(sentence)
for w in words:
    print(w, " : ", Lanc_stemmer.stem(w))
Output:
RegExp:
from nltk.stem import RegexpStemmer
regexp = RegexpStemmer('ing$|s$|e$|able$|ion$', min=4)
words = ['connecting', 'connect', 'connects', 'fractionally', 'fractions', "consult", "consultation", "consulting", "consults"]
for word in words:
    print(word, "--->", regexp.stem(word))
Output:
WordNet Lemmatizer:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
Output:
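The listing above only creates the lemmatizer object. A minimal usage sketch follows; the example words are illustrative additions, not taken from the original listing:

# illustrative WordNetLemmatizer usage; pos="a"/"v" tells the lemmatizer the part of speech
print(lemmatizer.lemmatize("rocks"))             # rock
print(lemmatizer.lemmatize("corpora"))           # corpus
print(lemmatizer.lemmatize("better", pos="a"))   # good
print(lemmatizer.lemmatize("running", pos="v"))  # run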
Lemmatization using Spacy:
pip3 install spacy
python -m spacy download en_core_web_sm
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Programmers are programming in different programming languages")  # sample sentence
print([(token.text, token.lemma_) for token in doc])
Output:
Practical No 2
Aim: Implement a Python program that splits a sentence into words and displays both the split words and the word count using a tokenizer function.
Source Code:
from nltk.tokenize import word_tokenize
text = "GeeksforGeeks is a Computer Science platform."
tokenized_text = word_tokenize(text)
print("Split Words: ", tokenized_text)
print("Count of Words: ", len(tokenized_text))
Output:
Practical No 3
Aim: Write a Python program to read a paragraph and generate tokens from it using a sentence tokenizer. Also find the parts of speech of each word in the generated tokens.
Source Code:
English Language:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
def tokenize_and_find_pos(paragraph):
    sentences = sent_tokenize(paragraph)
    for sentence in sentences:
        words = word_tokenize(sentence)
        tagged_words = pos_tag(words)
        print(tagged_words)

# Example paragraph
paragraph = "Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human (natural) languages. It is used to apply algorithms to identify and extract the natural language rules such that the unstructured language data is converted into a form that computers can understand."
tokenize_and_find_pos(paragraph)
Output:
Non-English Language:
#Non-English Tokenization
import nltk
nltk.download('punkt')
german_tokenizer = nltk.data.load('tokenizers/punkt/german.pickle')
german_tokens=german_tokenizer.tokenize('Wie geht es Ihnen? Gut,danke.')
print(german_tokens)
# note: this applies NLTK's (English) POS tagger to the German sentence tokens
ps = pos_tag(german_tokens)
print(ps)
Output:
Practical No 4
Aim: Draw a parse tree using Python for any given sentence under the required grammar rules using chunk parsing.
Source Code:
import nltk
from nltk import RegexpParser
from nltk.tokenize import word_tokenize
#from IPython.display import display
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')  # needed for nltk.pos_tag
# Define a sample grammar rule
grammar = r"""
NP: {<DT|JJ|NN.*>+} # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>} # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
CLAUSE: {<NP><VP>} # Chunk NP, VP pairs
"""
# Create a chunk parser
chunk_parser = RegexpParser(grammar)
# Define a sample sentence
sentence = "The quick brown fox jumps over the lazy dog"
# Tokenize the sentence
tokens = word_tokenize(sentence)
# Perform POS tagging
tagged_tokens = nltk.pos_tag(tokens)
# Apply chunk parsing
parse_tree = chunk_parser.parse(tagged_tokens)
# Display parse tree
parse_tree.pretty_print()
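The pretty_print() call above renders the tree as ASCII art. If a graphical display is available, the same chunk tree can also be drawn in a window; this single optional call is an addition, not part of the original listing:

parse_tree.draw()  # opens a Tk window showing the chunked parse tree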
Output:
Practical No 5
Aim: Write a Python program to find the term frequency and inverse document frequency for three documents. (Consider the 3 documents as 3 paragraphs.)
Source Code:
import math
from collections import Counter
def calculate_tf(text):
    words = text.lower().split()
    word_count = len(words)
    word_freq = Counter(words)
    tf = {word: freq / word_count for word, freq in word_freq.items()}
    return tf

def calculate_idf(documents):
    total_docs = len(documents)
    idf = {}
    all_words = [word for document in documents for word in set(document.lower().split())]
    for word in all_words:
        doc_count = sum([1 for document in documents if word in document.lower().split()])
        idf[word] = math.log(total_docs / (1 + doc_count))
    return idf
# Example documents
document1 = "This is the first document. It contains words to analyze term frequency and inverse document frequency."
document2 = "The second document has some overlapping words with the first document but also includes unique terms."
document3 = "Finally, the third document is shorter and has fewer words compared to the other two documents."
documents = [document1, document2, document3]
# Calculate TF for each document
tf_documents = [calculate_tf(document) for document in documents]
# Calculate IDF for all documents
idf = calculate_idf(documents)
print("Term Frequency (TF) for each document:")
for i, tf_doc in enumerate(tf_documents, start=1):
    print(f"Document {i}: {tf_doc}")

print("\nInverse Document Frequency (IDF) for all words:")
for word, idf_value in idf.items():
    print(f"{word}: {idf_value}")
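Though not required by the aim, the two measures are normally combined into a single TF-IDF weight (tf × idf). A short follow-up sketch reusing the values computed above; this is an illustrative addition, not part of the original listing:

# optional: combine TF and IDF into TF-IDF weights for each document
for i, tf_doc in enumerate(tf_documents, start=1):
    tfidf = {word: tf_value * idf.get(word, 0.0) for word, tf_value in tf_doc.items()}
    print(f"TF-IDF for Document {i}: {tfidf}")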
Output:
Practical No 6
Aim: Implement a Python program to remove stop words and identify the parts of speech for a given paragraph.
Source Code:
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word.lower() not in stop_words]
    return ' '.join(filtered_text)

def identify_pos(text):
    sentences = sent_tokenize(text)
    tagged_sentences = [pos_tag(word_tokenize(sentence)) for sentence in sentences]
    return tagged_sentences
# Example paragraph
paragraph = """
Natural Language Processing (NLP) is a subfield of artificial
intelligence concerned with the interaction between computers and
humans in natural language. It focuses on the interaction between
computers and humans in the natural language and it is a field at the
intersection of computer science, artificial intelligence, and
computational linguistics.
"""
print("paragraph: ", paragraph)
# Remove stop words
paragraph_without_stopwords = remove_stopwords(paragraph)
print("Paragraph without stopwords:")
print(paragraph_without_stopwords)
# Identify Parts of Speech
tagged_sentences = identify_pos(paragraph)
print("\nParts of speech:")
for sentence in tagged_sentences:
    print(sentence)
Output:
Practical No 7
Aim: Find the probability of a given sentence using Viterbi PCFG parsing; all the words present in the sentence must be in toy_pcfg1 or toy_pcfg2.
Source Code:
toy_pcfg1 = {
    'S': [(('NP', 'VP'), 0.9), (('VP',), 0.1)],
    'NP': [(('Det', 'N'), 0.8), (('N',), 0.2)],
    'VP': [(('V', 'NP'), 1.0)],
    'Det': [(('the',), 0.6), (('a',), 0.4)],
    'N': [(('cat',), 0.5), (('dog',), 0.5)],
    'V': [(('chased',), 1.0)]
}
toy_pcfg2 = {
    'S': [(('NP', 'VP'), 1.0)],
    'NP': [(('Det', 'N'), 1.0)],
    'VP': [(('V', 'NP'), 0.5), (('V',), 0.5)],
    'Det': [(('a',), 1.0)],
    'N': [(('mouse',), 1.0)],
    'V': [(('slept',), 0.5), (('ran',), 0.5)]
}
def get_terminals(pcfg):
    terminals = set()
    for productions in pcfg.values():
        for rhs, _ in productions:
            if isinstance(rhs, str):
                terminals.add(rhs)
            else:
                for symbol in rhs:
                    if isinstance(symbol, str):
                        terminals.add(symbol)
    return terminals
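The listing above stops after collecting the terminals, so the probability asked for in the aim is never computed. A minimal sketch of the remaining step, assuming NLTK's PCFG.fromstring and ViterbiParser and using a grammar string equivalent to the toy_pcfg1 dictionary above (the test sentence is illustrative):

import nltk
from nltk import PCFG
from nltk.parse import ViterbiParser

# PCFG equivalent of the toy_pcfg1 dictionary defined above
grammar = PCFG.fromstring("""
S -> NP VP [0.9] | VP [0.1]
NP -> Det N [0.8] | N [0.2]
VP -> V NP [1.0]
Det -> 'the' [0.6] | 'a' [0.4]
N -> 'cat' [0.5] | 'dog' [0.5]
V -> 'chased' [1.0]
""")

sentence = "the cat chased a dog"  # illustrative test sentence
tokens = sentence.split()

# parse only if every word is covered by the grammar's terminals
if set(tokens) <= get_terminals(toy_pcfg1):
    parser = ViterbiParser(grammar)
    for tree in parser.parse(tokens):  # yields the most probable parse
        tree.pretty_print()
        print("Probability of the sentence:", tree.prob())
else:
    print("The sentence contains words that are not in the grammar.")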
Output:
Practical No 8
Aim: Given two words, calculate the similarity between the words
a. By using Path Similarity
b. By using Wu-Palmer Similarity
Source Code:
import nltk
from nltk.corpus import wordnet

def calculate_similarities(word1, word2):
    # use the first (most common) synset of each word
    synsets1 = wordnet.synsets(word1)
    synsets2 = wordnet.synsets(word2)
    synset1 = synsets1[0]
    synset2 = synsets2[0]
    path_similarity = synset1.path_similarity(synset2)
    wup_similarity = synset1.wup_similarity(synset2)
    print("Path Similarity:", path_similarity)
    print("Wu-Palmer Similarity:", wup_similarity)

word1 = "dog"
word2 = "cat"
calculate_similarities(word1, word2)
Output:
Practical No 9
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
from nltk.chunk import tree2conlltags

sentence = "In August, India and Microsoft plan to address the issue of climate change and allotted $5000000 for it."
tokens = word_tokenize(sentence)
tagged = pos_tag(tokens)
ne_chunked = ne_chunk(tagged)  # named-entity chunk tree
iob_tagged = tree2conlltags(ne_chunked)
print("IOB Tags:", iob_tagged)
Source Code:
import nltk
from nltk.corpus import wordnet as wn

hello_synsets = wn.synsets('hello')
print("All synsets for 'hello':", hello_synsets)
first_synset = hello_synsets[0]
print("\nFirst Synset:", first_synset)
first_lemma = first_synset.lemmas()[0].name()
print("First lemma name of the 0th Synset:", first_lemma)
synset_name = first_synset.name()
synset_definition = first_synset.definition()
synset_examples = first_synset.examples()
print("\nName of the 0th Synset:", synset_name)
print("Definition of the 0th Synset:", synset_definition)
print("Examples of the 0th Synset:", synset_examples)
hypernyms = first_synset.hypernyms()
hyponyms = first_synset.hyponyms()
print("\nHypernyms of the 0th Synset:", hypernyms)
print("Hyponyms of the 0th Synset:", hyponyms)
Output: