NLP Practical
Practical 1
Write a program to perform word and sentence tokenization on English and Hindi text.
INPUT
import nltk
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize
dataset = "Hello Mr. Watson, how are you doing today? The weather is awsome. The garden is green. We should go out for a walk."
print(sent_tokenize(dataset))
for i in sent_tokenize(dataset):
    print(i)
OUTPUT
['Hello Mr. Watson, how are you doing today?', 'The weather is awsome.', 'The garden is green.', 'We should go out for a walk.']
Hello Mr. Watson, how are you doing today?
The weather is awsome.
The garden is green.
We should go out for a walk.
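The word-level call that produced the next output is not shown in the listing; a minimal sketch, reusing the same dataset:
INPUT
print(word_tokenize(dataset))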
OUTPUT
['Hello', 'Mr.', 'Watson', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'awsome', '.',
'The', 'garden', 'is', 'green', '.', 'We', 'should', 'go', 'out', 'for', 'a', 'walk', '.']
from nltk.tokenize import TreebankWordTokenizer
# tokenizers work by separating the words using punctuation and spaces
tokenizer = TreebankWordTokenizer()
print("TreebankWordTokenizer", tokenizer.tokenize(dataset))
OUTPUT
TreebankWordTokenizer ['Hello', 'Mr.', 'Watson', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The',
'weather', 'is', 'awsome.', 'The', 'garden', 'is', 'green.', 'We', 'should', 'go', 'out', 'for', 'a', 'walk', '.']
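The listing now switches to the Hindi sample, but the string itself is missing; the text below is reconstructed from the printed tokens (an assumption, left truncated at 'मा' exactly as in the source output):
INPUT
# Hindi sample text, reconstructed from the outputs below (assumed)
text = "इस लेख में हम आपको इंटरनेट के बारे में सम्पूर्ण जानकारी देने का प्रयास करेंगे। आज के आधुनिक युग में अधिकतर काम इंटरनेट के मा"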
print("word_tokenize",word_tokenize(text))
OUTPUT
word_tokenize ['इस', 'लेख', 'में', 'हम', 'आपको', 'इंटरनेट', 'के', 'बारे',
'में', 'सम्पू', 'र्ण', 'जानकारी', 'देने', 'का', 'प्रयास',
'करेंगे।', 'आज', 'के', 'आधुनिक', 'युग', 'में', 'अधिकतर', 'काम',
'इंटरनेट', 'के', 'मा']
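The sentence-level call for the Hindi text is also missing; a sketch consistent with the label printed in the next output:
INPUT
print("Sentence_tokenize", sent_tokenize(text))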
OUTPUT
Sentence_tokenize ['इस लेख में हम आपको इंटरनेट के बारे में सम्पू
र्ण जानकारी देने का प्रयास करेंगे। आज के आधुनिक युग में
अधिकतर काम इंटरनेट के मा']
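The WordPunctTokenizer call is not shown; a minimal sketch (the output shows how it splits Devanagari matras from their base letters, since they are not treated as word characters):
INPUT
from nltk.tokenize import WordPunctTokenizer
wpt = WordPunctTokenizer()
print("WordPunctTokenizer", wpt.tokenize(text))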
OUTPUT
WordPunctTokenizer ['इस', 'ल', 'े', 'ख', 'म', 'ें', 'हम', 'आपक', 'ो', 'इ', 'ं', 'टरन', 'े', 'ट', 'क',
'े', 'ब', 'ा', 'र', 'े', 'म', 'ें', 'सम', '्', 'प', 'ू', 'र', '्', 'ण', 'ज', 'ा', 'नक', 'ा', 'र', 'ी', 'द', 'े',
'न', 'े', 'क', 'ा', 'प', '्', 'रय', 'ा', 'स', 'कर', 'ें', 'ग', 'े।', 'आज', 'क', 'े', 'आध', 'ु', 'न', 'ि', 'क',
'य', 'ु', 'ग', 'म', 'ें', 'अध', 'ि', 'कतर', 'क', 'ा', 'म', 'इ', 'ं', 'टरन', 'े', 'ट', 'क', 'े', 'म', 'ा']
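The Treebank tokenization of the Hindi text is likewise missing; a sketch reusing the tokenizer created for the English text:
INPUT
print("TreebankWordTokenizer", tokenizer.tokenize(text))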
OUTPUT
TreebankWordTokenizer ['इस', 'लेख', 'में', 'हम', 'आपको', 'इंटरनेट', 'के', 'बारे',
'में', 'सम्पू', 'र्ण', 'जानकारी', 'देने', 'का', 'प्रयास', 'करेंगे।', 'आज',
'के', 'आधुनिक', 'युग', 'में', 'अधिकतर', 'काम', 'इंटरनेट', 'के', 'मा']
Practical 2
Write a program to remove stopwords from English text.
INPUT
import nltk
nltk.download('all')
nltk.download('stopwords')  # only needed if 'all' was not downloaded
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))
print(stop_words)
print("Total count of Stopwords :", len(stop_words))
OUTPUT
{'same', "they've", "he'd", 'hers', 'y', 'over', "should've", "mightn't", 'because', 'too', "that'll", 'an',
'am', 'while', 'can', "i'd", 'if', "she'll", 'any', "didn't", 'off', 'are', "mustn't", 'was', 'with', 'under', 'were',
'above', 'needn', 'should', 'him', 'they', 'through', 'he', 'again', 'its', 'more', 'herself', "won't", 'don',
"you'll", 'up', "you'd", 'll', 'ain', 'them', 'some', 'at', 'shan', 'will', "wouldn't", 'couldn', 'hadn', 'now',
'until', 'yours', 'ma', 'her', 'd', 'most', 'himself', 'doing', 'being', "i'm", 'yourself', 'for', "i'll", "hadn't",
'both', 'm', 'this', "you're", "needn't", 'there', "shouldn't", 'mightn', 'during', 'how', 'the', 'those', 'on',
"hasn't", 's', 'ourselves', 'you', "he's", 'is', 'as', 'than', 'theirs', 'we', 'o', 'or', 'i', 'she', 'his', 'own', 'a', 'do',
"it'd", 'shouldn', 'weren', "they'll", 'whom', 'had', "we'll", 'each', 'such', 'wasn', 'nor', 'doesn', 'these',
'between', "she'd", 'not', "he'll", "we'd", "don't", "we're", 'in', "she's", 'few', 'into', "doesn't", 'it',
"couldn't", 'but', 'when', "you've", 'your', 'yourselves', 'then', 'from', 'only', 'to', "i've", 'here', 'my',
'hasn', 'against', "they're", 'have', 'myself', 'before', 'mustn', 'aren', 'down', 'wouldn', 'ours', "it's",
"isn't", "it'll", 'why', 'other', 'won', 'isn', 'me', 'so', 'further', 'has', 've', "we've", 'by', 'didn', "shan't",
'did', 'been', 'no', "haven't", 'themselves', 'of', 'once', 'their', 'our', 'where', 'what', 're', "wasn't",
'having', 'after', 'which', 'be', "aren't", "they'd", "weren't", 'out', 'all', 'itself', 'about', 'that', 'and', 't',
'very', 'haven', 'below', 'who', 'just', 'does'}
Total count of Stopwords : 198
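The dataset tokenized in the next step is not reproduced in the listing; it is assumed to be a short paragraph assigned beforehand, along these lines:
INPUT
# placeholder; the full paragraph is not reproduced in the source
dataset = "They covered the precious mahogany coffin with a brown amalgam of rocks, decomposed organisms, ..."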
words = word_tokenize(dataset)
print(words)
print()
print("Total words :",len(words))
OUTPUT
['They', 'covered', 'the', 'precious', 'mahogany', 'coffin', 'with', 'a', 'brown', 'amalgam', 'of', 'rocks',
',', 'decomposed', 'organisms', ',', 'and', 'w', 'It', 'was', 'my', 'turn', 'to', 'take', 'the', 'shovel', ',', 'but', 'I',
'felt', 'too', 'ashamed', 'to', 'dutifully', 'send', 'her', 'off', 'when', 'I', 'had', 'not', 'p', 'I', 'refused', 'to',
'throw', 'dirt', 'on', 'her', '.', 'I', 'refused', 'to', 'let', 'go', 'of', 'my', 'grandmother', ',', 'to', 'accept', 'a',
'death', 'I', 'had', 'not', 's', 'to', 'believe', 'that', 'an', 'illness', 'could', 'not', 'only', 'interrupt', ',', 'but',
'steal', 'a', 'beloved', 'life', '.']
Total words : 83
filtered_sentence = []
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)

print(filtered_sentence)
print()
print("After removing stopwords", len(filtered_sentence))
OUTPUT
['They', 'covered', 'precious', 'mahogany', 'coffin', 'brown', 'amalgam', 'rocks', ',', 'decomposed',
'organisms', ',', 'w', 'It', 'turn', 'take', 'shovel', ',', 'I', 'felt', 'ashamed', 'dutifully', 'send', 'I', 'p', 'I',
'refused', 'throw', 'dirt', '.', 'I', 'refused', 'let', 'go', 'grandmother', ',', 'accept', 'death', 'I', 'believe',
'illness', 'could', 'interrupt', ',', 'steal', 'beloved', 'life', '.']
After removing stopwords 48
Practical 3
Write a program to perform stemming and lemmatization on English text.
INPUT
import nltk
nltk.download("all")
OUTPUT
program : program
programs : program
programmer : programm
programming : program
programmers : programm
# importing modules
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

ps = PorterStemmer()
sentence = "Programmers program with programming languages"
words = word_tokenize(sentence)
for w in words:
    print(w, " : ", ps.stem(w))
OUTPUT
Programmers : programm
program : program
with : with
programming : program
languages : languag
from nltk.stem import PorterStemmer

e_words = ["wait", "waiting", "waited", "waits"]
ps = PorterStemmer()
for w in e_words:
    rootWord = ps.stem(w)
    print(rootWord)
OUTPUT
wait
wait
wait
wait
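The input for the next output is missing from the listing; the sentence below is an assumption reconstructed from the stems and may differ slightly from the original:
INPUT
from nltk.tokenize import word_tokenize
# assumed sentence; the original is not reproduced in the source
sentence = "Hello Kajal, you have to build a very good site and I love visiting your site"
for w in word_tokenize(sentence):
    print(ps.stem(w))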
OUTPUT
hello
kajal
,
you
have
to
build
a
veri
good
site
and
i
love
visit
your
sit
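The Porter vs. Lancaster comparison code is not shown; a sketch consistent with the printed stems (the word list is assumed):
INPUT
from nltk.stem import PorterStemmer, LancasterStemmer
words = ["cats", "trouble", "troubling", "troubled"]  # assumed word list
porter = PorterStemmer()
lancaster = LancasterStemmer()
print("Porter Stemmer")
for w in words:
    print(porter.stem(w))
print("Lancaster Stemmer", " ".join(lancaster.stem(w) for w in words))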
OUTPUT
Porter Stemmer
cat
troubl
troubl
troubl
Lancaster Stemmer cat troubl troubl troubl
OUTPUT
‘pythoners are very intelligent and work very pythonly and now they are pythoning their way to
success.’
# Stemming
import nltk
from nltk.stem.porter import PorterStemmer

porter_stemmer = PorterStemmer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
for w in tokenization:
    print("Stemming for {} is {}".format(w, porter_stemmer.stem(w)))
OUTPUT
Stemming for studies is studi
Stemming for studying is studi
Stemming for cries is cri
Stemming for cry is cri
# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
for w in tokenization:
    print("Lemma for {} is {}".format(w, wordnet_lemmatizer.lemmatize(w)))
OUTPUT
Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry
Practical 4
Write a program to perform Part-of-Speech (POS) tagging on English text.
INPUT
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
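The tokenization step that produced the next output is not in the listing; a sketch, with the sentence reconstructed from the printed tokens:
from nltk.tokenize import word_tokenize
text = "Taj Mahal is one of the world’s most celebrated structures in the world. It is a stunning symbol of Indian rich history"
tokens = word_tokenize(text)
tokens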
OUTPUT
['Taj',
'Mahal',
'is',
'one',
'of',
'the',
'world',
'’',
's',
'most',
'celebrated',
'structures',
'in',
'the',
'world',
'.',
'It',
'is',
'a',
'stunning',
'symbol',
'of',
'Indian',
'rich',
'history']
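The POS-tagging call itself, which is the point of this practical, is not shown either; a minimal sketch:
# tag each token with its Penn Treebank part-of-speech tag
tagged = nltk.pos_tag(tokens)
print(tagged)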
# Tag Set
nltk.help.upenn_tagset()
OUTPUT
Practical 5
Write a program to perform Named Entity Recognition (NER) and chunking on English text.
INPUT
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
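The sentence, tokenization, and tagging code for the NER example are not shown; a sketch reconstructed from the output below:
from nltk.tokenize import word_tokenize
sentence = "Abraham Lincoln was an American statesman and lawyer who served as the 16th President of the United States"
tagged = nltk.pos_tag(word_tokenize(sentence))
tagged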
OUTPUT
[('Abraham', 'NNP'),
('Lincoln', 'NNP'),
('was', 'VBD'),
('an', 'DT'),
('American', 'JJ'),
('statesman', 'NN'),
('and', 'CC'),
('lawyer', 'NN'),
('who', 'WP'),
('served', 'VBD'),
('as', 'IN'),
('the', 'DT'),
('16th', 'CD'),
('President', 'NNP'),
('of', 'IN'),
('the', 'DT'),
('United', 'NNP'),
('States', 'NNPS')]
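The named-entity chunking step that produced the tree below would then be:
# group the tagged tokens into named-entity chunks (PERSON, GPE, ...)
ne_tree = nltk.ne_chunk(tagged)
print(ne_tree)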
OUTPUT
(S
(PERSON Abraham/NNP)
(PERSON Lincoln/NNP)
was/VBD
an/DT
(GPE American/JJ)
statesman/NN
and/CC
lawyer/NN
who/WP
served/VBD
as/IN
the/DT
16th/CD
President/NNP
of/IN
the/DT
(GPE United/NNP States/NNPS))
#Chunking
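The first half of the chunking code (tokenizing, tagging, and the chunk grammar) is missing; the grammar below is an assumption chosen to be consistent with the printed chunks (runs of proper nouns and single common nouns):
from nltk import pos_tag, RegexpParser
from nltk.tokenize import word_tokenize

text = "Taj Mahal is one of the world’s most celebrated structures in the world. It is a stunning symbol of Indian rich history"
tokens = word_tokenize(text)
print(tokens)
postagging = pos_tag(tokens)
print(postagging)

# assumed grammar: chunk sequences of proper nouns (NNP) and single common nouns (NN)
sequence_chunk = """
chunk: {<NNP>+}
       {<NN>}
"""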
OUTPUT
['Taj', 'Mahal', 'is', 'one', 'of', 'the', 'world', '’', 's', 'most', 'celebrated', 'structures', 'in',
'the', 'world', '.', 'It', 'is', 'a', 'stunning', 'symbol', 'of', 'Indian', 'rich', 'history']
[('Taj', 'NNP'), ('Mahal', 'NNP'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'),
('world', 'NN'), ('’', 'NNP'), ('s', 'VBZ'), ('most', 'JJS'), ('celebrated', 'JJ'), ('structures',
'NNS'), ('in', 'IN'), ('the', 'DT'), ('world', 'NN'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('a',
'DT'), ('stunning', 'JJ'), ('symbol', 'NN'), ('of', 'IN'), ('Indian', 'JJ'), ('rich', 'JJ'), ('history',
'NN')]
chunk = RegexpParser(sequence_chunk)
chunk_result = chunk.parse(postagging)
print(chunk_result)
OUTPUT
(S (chunk Taj/NNP Mahal/NNP) is/VBZ one/CD of/IN the/DT (chunk world/NN) (chunk ’/NNP)
s/VBZ most/JJS celebrated/JJ structures/NNS in/IN the/DT (chunk world/NN) ./. It/PRP is/VBZ
a/DT stunning/JJ (chunk symbol/NN) of/IN Indian/JJ rich/JJ (chunk history/NN))