R22 Nlp Python Programs
R22 Nlp Python Programs
Open the Command Prompt, type `python --version`, and press Enter to verify the Python installation.
If any package is not found in the NLTK library, open the IDLE shell, type the commands below, and press Enter:
import nltk
nltk.download()
Example:
nltk.download('stopwords')
a) Tokenization
Word tokenization:
# Word tokenization: split a sentence into word tokens with NLTK.
import nltk

# NOTE(review): in the handout this literal was broken across two lines
# without continuation (a SyntaxError); rejoined here to match the
# token list shown in the Output section.
word_data = ("It originated from the idea that there are readers who prefer "
             "learning new skills from the comforts of their drawing rooms")
nltk_tokens = nltk.word_tokenize(word_data)
print(nltk_tokens)
Output:
['It', 'originated', 'from', 'the', 'idea', 'that', 'there', 'are', 'readers', 'who', 'prefer', 'learning',
'new', 'skills', 'from', 'the', 'comforts', 'of', 'their', 'drawing', 'rooms']
Sentence tokenization:
# Sentence tokenization: split a paragraph into sentences with NLTK.
import nltk

# NOTE(review): literal rejoined onto one line (it was split without
# continuation in the handout). The text — including the 'Ananlysis'
# spelling and trailing space — is kept exactly as in the shown output.
sentence_data = ("The First sentence is about Python. The Second: about Django. "
                 "You can learn Python,Django and Data Ananlysis here. ")
nltk_tokens = nltk.sent_tokenize(sentence_data)
print(nltk_tokens)
Output:
['The First sentence is about Python.', 'The Second: about Django.', 'You can learn
Python,Django and Data Ananlysis here.']
Character tokenization
# Character tokenization: split a string into its individual characters.
# The built-in list() constructor iterates a string character by character,
# so no NLTK call is required (the handout's `Import nltk` — note the
# capital I — was a SyntaxError and the import was unused anyway).
charact_data = "Python programming"  # sample text; was undefined in the handout
charact_tokens = list(charact_data)
print(charact_tokens)
Output:
['P', 'y', 't', 'h', 'o', 'n', 'p', 'r', 'o', 'g', 'r', 'a', 'm', 'm', 'i', 'n', 'g'
# Stop-word removal: print only the words that are NOT English stop words.
from nltk.corpus import stopwords

en_stops = set(stopwords.words('english'))

# Sample word list; reconstructed from the Output section (There, tree,
# near, river survive; is/a/the are filtered out).
all_words = ['There', 'is', 'a', 'tree', 'near', 'the', 'river']
for word in all_words:
    # NOTE(review): the membership test is case-sensitive — 'There' is
    # printed because the stop-word list contains lower-case 'there'.
    if word not in en_stops:
        print(word)
Output:
There
tree
near
river
# Stemming: reduce each word to its Porter stem.
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer  # was missing in the handout

stemmer = PorterStemmer()
words = ["running", "beautifulness", "rivers", "caresses", "happily", "studies", "banking"]
# BUG in handout: stemmed_words was printed but never computed.
stemmed_words = [stemmer.stem(word) for word in words]
print("Original Words:", words)
print("Stemmed Words", stemmed_words)
Output:
a) Word Analysis
import re
from collections import Counter  # was missing in the handout


def word_analysis(text):
    """Print word-frequency and word-length statistics for `text`.

    Lower-cases the text, extracts alphanumeric words (punctuation is
    ignored), then prints:
      1. the frequency of every distinct word,
      2. the length of every distinct word,
      3. the 10 most common words with their counts.
    """
    text = text.lower()
    # Extract words, dropping punctuation such as the trailing periods.
    words = re.findall(r'\b\w+\b', text)
    word_freq = Counter(words)
    most_common_words = word_freq.most_common(10)

    print("Word Frequency:")
    for word, freq in word_freq.items():
        print(f"{word}: {freq}")

    print("\nWord Length:")
    for word in word_freq:  # each distinct word once, in first-seen order
        length = len(word)
        print(f"{word}: {length}")

    # Most common words (printed without a header in the handout's output).
    print()
    for word, freq in most_common_words:
        print(f"{word}: {freq}")


text = "This is an example sentence for word analysis. This sentence is just an example."
word_analysis(text)
Output:
Word Frequency:
this: 2
is: 2
an: 2
example: 2
sentence: 2
for: 1
word: 1
analysis: 1
just: 1
Word Length:
this: 4
is: 2
an: 2
example: 7
sentence: 8
for: 3
word: 4
analysis: 8
just: 4
this: 2
is: 2
an: 2
example: 2
sentence: 2
for: 1
word: 1
analysis: 1
just: 1
b) Word Generation
# Word generation: pick random lemma names from WordNet for a given POS.
import random
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet  # was missing in the handout


def generate_words(part_of_speech, num_words=10):
    """Return `num_words` randomly chosen lemma names for the given
    WordNet part of speech ('n', 'v', 'a' or 'r')."""
    synsets = list(wordnet.all_synsets(part_of_speech))
    words = []
    for _ in range(num_words):
        synset = random.choice(synsets)
        lemma = random.choice(synset.lemmas())
        words.append(lemma.name())
    return words


# The def line and these driver calls were missing from the handout;
# reconstructed from the Nouns/Verbs/Adjectives/Adverbs output sections.
print("Nouns:")
for noun in generate_words('n'):
    print(noun)
print("\nVerbs:")
for verb in generate_words('v'):
    print(verb)
print("\nAdjectives:")
for adjective in generate_words('a'):
    print(adjective)
print("\nAdverbs:")
for adverb in generate_words('r'):
    print(adverb)
Output:
Nouns:
Haastia_pulvinaris
televangelist
genus_Estrilda
E._H._Weber
insidiousness
Evangelical_and_Reformed_Church
garnishee
semigloss
powder_keg
townspeople
Verbs:
encapsulate
remain
salve
cruise
credit
charge
drone_on
up
fume
sandblast
Adjectives:
stipendiary
reportable
stilly
live
adscititious
bindable
upper-class
god-awful
organized
untechnical
Adverbs:
pitty-patty
naturally
managerially
smartly
providently
dumbly
worse
tight
magniloquently
pointlessly
4. Create a sample list of at least 5 words with ambiguous sense and write a python program
to implement WSD.
# Word Sense Disambiguation (WSD) using NLTK's Lesk algorithm.
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.wsd import lesk                 # was missing in the handout
from nltk.tokenize import word_tokenize   # was missing in the handout

# Five ambiguous words, each paired with a context sentence that should
# select one particular sense. NOTE(review): the dict contents were lost
# in the handout; the keys are reconstructed from the Output section,
# the context sentences are illustrative.
contexts = {
    "bank": "He deposited the money in the bank near the river",
    "bat": "The cricketer swung the bat and hit the ball",
    "bark": "The dog began to bark loudly at the stranger",
    "pitch": "The bowler delivered the ball on a dry pitch",
    "lead": "The pipes in the old house were made of lead",
}

# Iterate over each ambiguous word and print its disambiguated sense based on context
for word in contexts:
    context = contexts[word]
    sense = lesk(word_tokenize(context), word)
    print(f"Word: {word}")
    print(f"Context: {context}")
    if sense:
        print(f"Sense: {sense.name()} - {sense.definition()}")
    else:
        print("No suitable sense found.")
    print("-" * 50)
Output:
Word: bank
--------------------------------------------------
Word: bat
--------------------------------------------------
Word: bark
--------------------------------------------------
Word: pitch
--------------------------------------------------
Word: lead
--------------------------------------------------
# Program 5: stem a sample word list and print the stems.
import nltk
from nltk.stem import PorterStemmer  # was missing in the handout

stemmer = PorterStemmer()
# NOTE(review): the word list was lost in extraction; this sample is
# illustrative — replace with the intended list if known.
sample_words = ["connect", "connected", "connecting", "connection", "connections"]
# BUG in handout: stemmed_words was printed but never computed.
stemmed_words = [stemmer.stem(word) for word in sample_words]
print(stemmed_words)
Output:
6. Create a sample list of at least 10 words, apply POS tagging, and find the POS tag for any given word
# POS tagging: find the part-of-speech tag for any given word.
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, pos_tag  # was missing in the handout


def find_pos_tag(word):
    """Return the Penn Treebank POS tag of a single word."""
    tokens = word_tokenize(word)
    pos_tags = pos_tag(tokens)
    return pos_tags[0][1]


# Sample list of at least 10 words, as the exercise requires.
sample_words = ["run", "beautiful", "quickly", "dog", "jump",
                "happy", "slowly", "river", "sing", "bright"]
for word in sample_words:
    # BUG in handout: the result was assigned to the name `pos_tag`,
    # shadowing nltk's pos_tag function and breaking later calls.
    tag = find_pos_tag(word)
    print(f"{word}: {tag}")
Output:
# Morphological analysis: POS-tag the text, then lemmatize every token
# using the tag to pick the correct WordNet part of speech.
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk import word_tokenize, pos_tag       # was missing in the handout
from nltk.stem import WordNetLemmatizer       # was missing in the handout

lemmatizer = WordNetLemmatizer()


def morphological_analysis(text):
    """Return the list of lemmas for every token in `text`.

    Each token is POS-tagged; the Penn Treebank tag is mapped onto a
    WordNet POS letter so the lemmatizer picks the right lemma (e.g.
    'jumps'/VBZ -> 'jump'). Tokens with no WordNet POS (punctuation,
    determiners, ...) are kept unchanged.
    """
    tokens = word_tokenize(text)
    tagged_tokens = pos_tag(tokens)
    lemmatized_tokens = []
    for token, tag in tagged_tokens:  # this loop line was missing in the handout
        # Map Penn Treebank tag prefixes onto WordNet POS letters.
        if tag.startswith('J'):
            wordnet_tag = 'a'   # adjective
        elif tag.startswith('V'):
            wordnet_tag = 'v'   # verb
        elif tag.startswith('N'):
            wordnet_tag = 'n'   # noun
        elif tag.startswith('R'):
            wordnet_tag = 'r'   # adverb
        else:
            wordnet_tag = ''    # no WordNet POS: leave the token as-is
        if wordnet_tag:
            # This lemmatize call was missing in the handout.
            lemmatized_token = lemmatizer.lemmatize(token, wordnet_tag)
        else:
            lemmatized_token = token
        lemmatized_tokens.append(lemmatized_token)
    return lemmatized_tokens


text = "The quick brown fox jumps over the lazy dog."
print("Original Text:")
print(text)
print("\nLemmatized Tokens:")
print(morphological_analysis(text))
Output:
Original Text:
Lemmatized Tokens:
['The', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '.']