NLP Practical

Practical 1

Write a program to perform word and sentence tokenization on English and Hindi text.

INPUT

!pip install nltk

import nltk
nltk.download('punkt_tab')

# Importing the library


import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Importing the Data


dataset = """Hello Mr. Watson, how are you doing today?
The weather is awesome. The garden is green.
We should go out for a walk."""

# Tokenize the Sentences


print(sent_tokenize(dataset))

for i in sent_tokenize(dataset):
    print(i)

OUTPUT
['Hello Mr. Watson, how are you doing today?', 'The weather is awesome.', 'The garden is green.', 'We should go out for a walk.']

Hello Mr. Watson, how are you doing today?
The weather is awesome.
The garden is green.
We should go out for a walk.

# Tokenize the Words


print(word_tokenize(dataset))

OUTPUT
['Hello', 'Mr.', 'Watson', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'awesome', '.',
'The', 'garden', 'is', 'green', '.', 'We', 'should', 'go', 'out', 'for', 'a', 'walk', '.']

from nltk.tokenize import word_tokenize


print("word_tokenize",word_tokenize(dataset))

OUTPUT
word_tokenize ['Hello', 'Mr.', 'Watson', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'awesome', '.', 'The', 'garden', 'is', 'green', '.', 'We', 'should', 'go', 'out', 'for', 'a', 'walk', '.']
from nltk.tokenize import TreebankWordTokenizer
# tokenizers work by separating the words using punctuation and spaces.
tokenizer = TreebankWordTokenizer()
print("TreebankWordTokenizer",tokenizer.tokenize(dataset))

OUTPUT
TreebankWordTokenizer ['Hello', 'Mr.', 'Watson', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The',
'weather', 'is', 'awesome.', 'The', 'garden', 'is', 'green.', 'We', 'should', 'go', 'out', 'for', 'a', 'walk', '.']

from nltk.tokenize import word_tokenize


text = "इस लेख में हम आपको इंटरनेट के बारे में सम्पू र्ण जानकारी देने
का प्रयास करेंगे। आज के आधुनिक युग में अधिकतर काम इंटरनेट के मा"

print("word_tokenize",word_tokenize(text))

OUTPUT
word_tokenize ['इस', 'लेख', 'में', 'हम', 'आपको', 'इंटरनेट', 'के', 'बारे',
'में', 'सम्पू', 'र्ण', 'जानकारी', 'देने', 'का', 'प्रयास',
'करेंगे।', 'आज', 'के', 'आधुनिक', 'युग', 'में', 'अधिकतर', 'काम',
'इंटरनेट', 'के', 'मा']

from nltk.tokenize import sent_tokenize


print("Sentence_tokenize",sent_tokenize(text))

OUTPUT
Sentence_tokenize ['इस लेख में हम आपको इंटरनेट के बारे में सम्पू
र्ण जानकारी देने का प्रयास करेंगे। आज के आधुनिक युग में
अधिकतर काम इंटरनेट के मा']
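
Note that sent_tokenize returns the whole Hindi passage as a single sentence: the default punkt model is trained on English and does not treat the Devanagari danda (।) as a sentence boundary. As a rough workaround (an addition, not part of the original practical), the text can be split on the danda explicitly:

# Split the Hindi text on the danda (।); a simple workaround sketch,
# not the original practical's approach.
hindi_sentences = [s.strip() for s in text.split("।") if s.strip()]
print(hindi_sentences)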

from nltk.tokenize import WordPunctTokenizer


# It separates punctuation from the words.
tokenizer = WordPunctTokenizer()
print("WordPunctTokenizer",tokenizer.tokenize(text))

OUTPUT
WordPunctTokenizer ['इस', 'ल', 'े', 'ख', 'म', 'ें', 'हम', 'आपक', 'ो', 'इ', 'ं', 'टरन', 'े', 'ट', 'क',
'े', 'ब', 'ा', 'र', 'े', 'म', 'ें', 'सम', '्', 'प', 'ू', 'र', '्', 'ण', 'ज', 'ा', 'नक', 'ा', 'र', 'ी', 'द', 'े',
'न', 'े', 'क', 'ा', 'प', '्', 'रय', 'ा', 'स', 'कर', 'ें', 'ग', 'े।', 'आज', 'क', 'े', 'आध', 'ु', 'न', 'ि', 'क',
'य', 'ु', 'ग', 'म', 'ें', 'अध', 'ि', 'कतर', 'क', 'ा', 'म', 'इ', 'ं', 'टरन', 'े', 'ट', 'क', 'े', 'म', 'ा']
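
WordPunctTokenizer uses the pattern \w+|[^\w\s]+, and the Devanagari vowel signs and virama are combining marks rather than word characters, which is why they end up separated from their consonants above. A whitespace-based RegexpTokenizer (an alternative sketch, not part of the original practical) keeps each Hindi word intact:

from nltk.tokenize import RegexpTokenizer

# Tokenize on runs of non-whitespace so the matras stay attached
# to their consonants (alternative sketch, not the original approach).
ws_tokenizer = RegexpTokenizer(r"\S+")
print("RegexpTokenizer", ws_tokenizer.tokenize(text))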

from nltk.tokenize import TreebankWordTokenizer


# tokenizers work by separating the words using punctuation and spaces.
tokenizer = TreebankWordTokenizer()
print("TreebankWordTokenizer",tokenizer.tokenize(text))

OUTPUT
TreebankWordTokenizer ['इस', 'लेख', 'में', 'हम', 'आपको', 'इंटरनेट', 'के', 'बारे',
'में', 'सम्पू', 'र्ण', 'जानकारी', 'देने', 'का', 'प्रयास', 'करेंगे।', 'आज',
'के', 'आधुनिक', 'युग', 'में', 'अधिकतर', 'काम', 'इंटरनेट', 'के', 'मा']

Practical 2

Write a program to identify Stopwords in a given sentence in English.

INPUT

import nltk
nltk.download('all')

import nltk
nltk.download('stopwords') # only if all are not downloaded
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

dataset = """They covered the precious mahogany coffin with a


brown amalgam of rocks, decomposed organisms, and w
It was my turn to take the shovel, but I felt too ashamed to
dutifully send her off when I had not p
I refused to throw dirt on her. I refused to let go of my
grandmother, to accept a death I had not s
to believe that an illness could not only interrupt, but steal a
beloved life."""

# Create a set of English Stop Words


stop_words = set(stopwords.words("english"))
print(stop_words)
print()
print("Total count of Stopwords :",len(stop_words))

OUTPUT

{'same', "they've", "he'd", 'hers', 'y', 'over', "should've", "mightn't", 'because', 'too', "that'll", 'an',
'am', 'while', 'can', "i'd", 'if', "she'll", 'any', "didn't", 'off', 'are', "mustn't", 'was', 'with', 'under', 'were',
'above', 'needn', 'should', 'him', 'they', 'through', 'he', 'again', 'its', 'more', 'herself', "won't", 'don',
"you'll", 'up', "you'd", 'll', 'ain', 'them', 'some', 'at', 'shan', 'will', "wouldn't", 'couldn', 'hadn', 'now',
'until', 'yours', 'ma', 'her', 'd', 'most', 'himself', 'doing', 'being', "i'm", 'yourself', 'for', "i'll", "hadn't",
'both', 'm', 'this', "you're", "needn't", 'there', "shouldn't", 'mightn', 'during', 'how', 'the', 'those', 'on',
"hasn't", 's', 'ourselves', 'you', "he's", 'is', 'as', 'than', 'theirs', 'we', 'o', 'or', 'i', 'she', 'his', 'own', 'a', 'do',
"it'd", 'shouldn', 'weren', "they'll", 'whom', 'had', "we'll", 'each', 'such', 'wasn', 'nor', 'doesn', 'these',
'between', "she'd", 'not', "he'll", "we'd", "don't", "we're", 'in', "she's", 'few', 'into', "doesn't", 'it',
"couldn't", 'but', 'when', "you've", 'your', 'yourselves', 'then', 'from', 'only', 'to', "i've", 'here', 'my',
'hasn', 'against', "they're", 'have', 'myself', 'before', 'mustn', 'aren', 'down', 'wouldn', 'ours', "it's",
"isn't", "it'll", 'why', 'other', 'won', 'isn', 'me', 'so', 'further', 'has', 've', "we've", 'by', 'didn', "shan't",
'did', 'been', 'no', "haven't", 'themselves', 'of', 'once', 'their', 'our', 'where', 'what', 're', "wasn't",
'having', 'after', 'which', 'be', "aren't", "they'd", "weren't", 'out', 'all', 'itself', 'about', 'that', 'and', 't',
'very', 'haven', 'below', 'who', 'just', 'does'}

Total count of Stopwords : 198
words = word_tokenize(dataset)
print(words)
print()
print("Total words :",len(words))

OUTPUT

['They', 'covered', 'the', 'precious', 'mahogany', 'coffin', 'with', 'a', 'brown', 'amalgam', 'of', 'rocks',
',', 'decomposed', 'organisms', ',', 'and', 'w', 'It', 'was', 'my', 'turn', 'to', 'take', 'the', 'shovel', ',', 'but', 'I',
'felt', 'too', 'ashamed', 'to', 'dutifully', 'send', 'her', 'off', 'when', 'I', 'had', 'not', 'p', 'I', 'refused', 'to',
'throw', 'dirt', 'on', 'her', '.', 'I', 'refused', 'to', 'let', 'go', 'of', 'my', 'grandmother', ',', 'to', 'accept', 'a',
'death', 'I', 'had', 'not', 's', 'to', 'believe', 'that', 'an', 'illness', 'could', 'not', 'only', 'interrupt', ',', 'but',
'steal', 'a', 'beloved', 'life', '.']

Total words : 83

filtered_sentence = []
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)
print(filtered_sentence)
print()
print("After removing stopwords", len(filtered_sentence))

OUTPUT

['They', 'covered', 'precious', 'mahogany', 'coffin', 'brown', 'amalgam', 'rocks', ',', 'decomposed',
'organisms', ',', 'w', 'It', 'turn', 'take', 'shovel', ',', 'I', 'felt', 'ashamed', 'dutifully', 'send', 'I', 'p', 'I',
'refused', 'throw', 'dirt', '.', 'I', 'refused', 'let', 'go', 'grandmother', ',', 'accept', 'death', 'I', 'believe',
'illness', 'could', 'interrupt', ',', 'steal', 'beloved', 'life', '.']

After removing stopwords 48
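
The comparison above is case-sensitive, which is why capitalised words such as 'They', 'It' and 'I' remain in the filtered list. As a small variation (not part of the original practical), lowercasing each token before the lookup removes these as well:

# Case-insensitive stopword removal (a variation on the loop above)
filtered_lower = [w for w in words if w.lower() not in stop_words]
print(filtered_lower)
print("After case-insensitive removal :", len(filtered_lower))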
Practical 3

Write a program to perform Stemming and Lemmatization for English Text

INPUT

import nltk
nltk.download("all")

# import these modules


from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()
# choose some words to be stemmed
words = ["program", "programs", "programmer", "programming",
"programmers"]
for w in words:
print(w, " : ", ps.stem(w))

OUTPUT

program : program
programs : program
programmer : programm
programming : program
programmers : programm
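
For comparison (an optional addition, not part of the original practical), NLTK also provides the Snowball stemmer, a refinement of the Porter algorithm, which can be run on the same word list:

from nltk.stem import SnowballStemmer

# Snowball ("Porter2") stemmer applied to the same words (optional comparison)
snowball = SnowballStemmer("english")
for w in words:
    print(w, " : ", snowball.stem(w))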

# importing modules
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
ps = PorterStemmer()
sentence = "Programmers program with programming languages"
words = word_tokenize(sentence)
for w in words:
    print(w, " : ", ps.stem(w))

OUTPUT

Programmers : programm
program : program
with : with
programming : program
languages : languag
from nltk.stem import PorterStemmer
e_words= ["wait", "waiting", "waited", "waits"]
ps =PorterStemmer()
for w in e_words:
    rootWord = ps.stem(w)
    print(rootWord)

OUTPUT

wait
wait
wait
wait

from nltk.stem import PorterStemmer


from nltk.tokenize import sent_tokenize, word_tokenize
sentence="Hello Kajal, You have to build a very good site and I
love visiting your site."
words = word_tokenize(sentence)
ps = PorterStemmer()
for w in words:
rootWord=ps.stem(w)
print(rootWord)

OUTPUT

hello
kajal
,
you
have
to
build
a
veri
good
site
and
i
love
visit
your
site
.

#create an object of class PorterStemmer


from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
porter = PorterStemmer()
lancaster=LancasterStemmer()
#provide a word to be stemmed
print("Porter Stemmer")
print(porter.stem("cats"))
print(porter.stem("trouble"))
print(porter.stem("troubling"))
print(porter.stem("troubled"))
print()
print("Lancaster Stemmer")
print(lancaster.stem("cats"))
print(lancaster.stem("trouble"))
print(lancaster.stem("troubling"))
print(lancaster.stem("troubled"))

OUTPUT

Porter Stemmer
cat
troubl
troubl
troubl

Lancaster Stemmer
cat
troubl
troubl
troubl

#A list of words to be stemmed


word_list = ["friend", "friendship", "friends",
"friendships","stabil","destabilize","misunderstanding","railroad
"]
print("{0:20}{1:20}{2:20}".format("Word","Porter
Stemmer","lancaster Stemmer"))
for word in word_list:
print("{0:20}{1:20}
{2:20}".format(word,porter.stem(word),lancaster.stem(word)))

OUTPUT

sentence="Pythoners are very intelligent and work very pythonly


and now they are pythoning their way to success."
porter.stem(sentence)

OUTPUT
'pythoners are very intelligent and work very pythonly and now they are pythoning their way to success.'

#stemming
import nltk
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
for w in tokenization:
    print("Stemming for {} is {}".format(w, porter_stemmer.stem(w)))

OUTPUT

Stemming for studies is studi


Stemming for studying is studi
Stemming for cries is cri
Stemming for cry is cri

# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
for w in tokenization:
    print("Lemma for {} is {}".format(w, wordnet_lemmatizer.lemmatize(w)))

OUTPUT

Lemma for studies is study


Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry
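
By default WordNetLemmatizer treats every word as a noun, which is why 'studying' is returned unchanged. Passing a part-of-speech hint changes the result (a follow-up sketch, not part of the original output):

# Lemmatize with a verb POS hint (follow-up sketch)
print("Lemma for studying (as verb) is", wordnet_lemmatizer.lemmatize("studying", pos="v"))
print("Lemma for cries (as verb) is", wordnet_lemmatizer.lemmatize("cries", pos="v"))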

Practical 4

Write a program to segregate Part of Speech (POS TAGGING) for English Text.

INPUT
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')

# Importing the libraries


import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Importing the data


dataset = """Taj Mahal is one of the world’s most celebrated
structures
in the world.
It is a stunning symbol of Indian rich history"""

# Tokenize the data


new_data = word_tokenize(dataset)
new_data

OUTPUT

['Taj',
'Mahal',
'is',
'one',
'of',
'the',
'world',
'’',
's',
'most',
'celebrated',
'structures',
'in',
'the',
'world',
'.',
'It',
'is',
'a',
'stunning',
'symbol',
'of',
'Indian',
'rich',
'history']
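
The run above stops at tokenization. To actually segregate the parts of speech, the tagger imported earlier still has to be applied to the tokens; a minimal step for this (assumed from the practical's aim, not shown in the original run) is:

# Apply POS tagging to the tokens (step assumed from the practical's aim)
tagged_data = pos_tag(new_data)
print(tagged_data)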

# Tag Set
nltk.help.upenn_tagset()

OUTPUT

(Prints the description of every Penn Treebank tag; the full listing is omitted here.)
Practical 5

Write a program to perform Named Entity Recognition (NER) & Chunking on English Text.

Named Entity Recognition
INPUT

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')

# Importing the libraries


from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

dataset = """Abraham Lincoln was an American statesman and lawyer


who served as the 16th President of the United States"""

# Tokenize and tagging the data


dataset_tag = pos_tag(word_tokenize(dataset))
dataset_tag

OUTPUT
[('Abraham', 'NNP'),
('Lincoln', 'NNP'),
('was', 'VBD'),
('an', 'DT'),
('American', 'JJ'),
('statesman', 'NN'),
('and', 'CC'),
('lawyer', 'NN'),
('who', 'WP'),
('served', 'VBD'),
('as', 'IN'),
('the', 'DT'),
('16th', 'CD'),
('President', 'NNP'),
('of', 'IN'),
('the', 'DT'),
('United', 'NNP'),
('States', 'NNPS')]

# Apply Named Entity Recognition with ne_chunk


data_ner = ne_chunk(dataset_tag)
print(data_ner)

OUTPUT
(S
(PERSON Abraham/NNP)
(PERSON Lincoln/NNP)
was/VBD
an/DT
(GPE American/JJ)
statesman/NN
and/CC
lawyer/NN
who/WP
served/VBD
as/IN
the/DT
16th/CD
President/NNP
of/IN
the/DT
(GPE United/NNP States/NNPS))
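
To pull just the labelled entities out of the tree returned by ne_chunk, the subtrees can be walked directly (a small follow-up sketch, not part of the original practical):

# Collect the named entities and their labels from the NER tree
for subtree in data_ner.subtrees():
    if subtree.label() != 'S':
        entity = " ".join(token for token, tag in subtree.leaves())
        print(subtree.label(), ":", entity)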

#Chunking

# Importing the libraries


import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser

dataset = """Taj Mahal is one of the world’s most celebrated


structures
in the world.
It is a stunning symbol of Indian rich history"""

# Tokenize the data


new_data = word_tokenize(dataset)
print(new_data)

OUTPUT

['Taj', 'Mahal', 'is', 'one', 'of', 'the', 'world', '’', 's', 'most', 'celebrated', 'structures', 'in',
'the', 'world', '.', 'It', 'is', 'a', 'stunning', 'symbol', 'of', 'Indian', 'rich', 'history']

# Apply the POS Tagging


postagging = pos_tag(new_data)
print(postagging)
OUTPUT

[('Taj', 'NNP'), ('Mahal', 'NNP'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'),
('world', 'NN'), ('’', 'NNP'), ('s', 'VBZ'), ('most', 'JJS'), ('celebrated', 'JJ'), ('structures',
'NNS'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('a',
'DT'), ('stunning', 'JJ'), ('symbol', 'NN'), ('of', 'IN'), ('Indian', 'JJ'), ('rich', 'JJ'), ('history',
'NN')]

# Define the sequence of Chunk


sequence_chunk = """
chunk:
{<NNPS>+}
{<NNP>+}
{<NN>+} """

chunk = RegexpParser(sequence_chunk)

chunk_result = chunk.parse(postagging)
print(chunk_result)

OUTPUT

(S (chunk Taj/NNP Mahal/NNP) is/VBZ one/CD of/IN the/DT (chunk world/NN) (chunk ’/NNP)
s/VBZ most/JJS celebrated/JJ structures/NNS in/IN the/DT (chunk world/NN) ./. It/PRP is/VBZ
a/DT stunning/JJ (chunk symbol/NN) of/IN Indian/JJ rich/JJ (chunk history/NN))
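
The grammar above only groups runs of nouns. As a variation (an assumption, not part of the original practical), a noun-phrase grammar that also pulls in determiners and adjectives can be defined the same way:

# Noun-phrase chunking: optional determiner, any adjectives, one or more nouns
np_chunker = RegexpParser("NP: {<DT>?<JJ>*<NN.*>+}")
print(np_chunker.parse(postagging))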
