0% found this document useful (0 votes)
356 views17 pages

Natural Language Processing

The document discusses various natural language processing tasks in Python including tokenization, stop word removal, stemming, part-of-speech tagging, morphological analysis, n-gram generation, and n-gram smoothing. Code examples with outputs are provided to demonstrate how to perform tokenization, stop word removal, stemming with PorterStemmer, word sense disambiguation with Lesk, part-of-speech tagging, morphological analysis with NLTK, n-gram generation with NLTK ngrams library, and n-gram smoothing.

Uploaded by

coding ak
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
356 views17 pages

Natural Language Processing

The document discusses various natural language processing tasks in Python including tokenization, stop word removal, stemming, part-of-speech tagging, morphological analysis, n-gram generation, and n-gram smoothing. Code examples with outputs are provided to demonstrate how to perform tokenization, stop word removal, stemming with PorterStemmer, word sense disambiguation with Lesk, part-of-speech tagging, morphological analysis with NLTK, n-gram generation with NLTK ngrams library, and n-gram smoothing.

Uploaded by

coding ak
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 17

1.

Write a Python Program to perform following tasks on text a)


Tokenization b) Stop word Removal
import nltk

from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords

def preprocess_text(text):
    """Tokenize *text* and drop English stop words.

    Returns the surviving tokens in their original order; comparison is
    case-insensitive but the returned tokens keep their original casing.
    """
    english_stops = set(stopwords.words('english'))
    return [
        token
        for token in word_tokenize(text)
        if token.lower() not in english_stops
    ]

def main():
    """Demonstrate tokenization and stop-word removal on a sample sentence."""
    text = ("NLTK is a leading platform for building Python programs "
            "to work with human language data.")
    filtered = preprocess_text(text)

    print("Original Text:")
    print(text)
    print("\nTokenized Text:")
    print(filtered)


if __name__ == "__main__":
    main()

Output:-

Original Text:

NLTK is a leading platform for building Python programs to work with


human language data.

Tokenized Text:

['NLTK', 'leading', 'platform', 'building', 'Python', 'programs', 'work', 'human',


'language', 'data', '.']

2. Write a Python program to implement Porter stemmer algorithm for


stemming
import nltk

from nltk.tokenize import word_tokenize

from nltk.corpus import stopwords

from nltk.stem import PorterStemmer

def preprocess_text(text):
    """Tokenize *text* and remove English stop words (case-insensitive)."""
    stop_set = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    return [tok for tok in tokens if tok.lower() not in stop_set]

def apply_stemming(tokens):
    """Reduce every token in *tokens* to its Porter stem."""
    stemmer = PorterStemmer()
    return [stemmer.stem(tok) for tok in tokens]

def main():
    """Demonstrate tokenization, stop-word removal and Porter stemming."""
    text = ("NLTK is a leading platform for building Python programs "
            "to work with human language data.")
    filtered = preprocess_text(text)
    stems = apply_stemming(filtered)

    print("Original Text:")
    print(text)
    print("\nTokenized Text:")
    print(filtered)
    print("\nStemmed Text:")
    print(stems)


if __name__ == "__main__":
    main()
Output:-
Original Text:

NLTK is a leading platform for building Python programs to work with human
language data.

Tokenized Text:

['NLTK', 'leading', 'platform', 'building', 'Python', 'programs', 'work', 'human',


'language', 'data', '.']

Stemmed Text:

['nltk', 'lead', 'platform', 'build', 'python', 'program', 'work', 'human', 'languag', 'data',
'.']

3. Write Python Program for a) Word Analysis b) Word Generation


with output.
import nltk

from nltk.corpus import brown

def word_analysis():
    """Print the ten most frequent tokens in the Brown corpus."""
    nltk.download('brown')
    frequency = nltk.FreqDist(brown.words())
    print("10 Most Common Words:")
    print(frequency.most_common(10))

def word_generation():
    """Generate a short pseudo-sentence from Brown-corpus bigram statistics.

    Builds a successor table (word -> list of words observed immediately
    after it) from consecutive word pairs, then performs a random walk of
    up to 10 steps starting from a randomly chosen word, and prints the
    resulting "sentence".
    """
    import random

    # Load the Brown corpus
    nltk.download('brown')
    words = brown.words()

    # Map each word to every word observed immediately after it.
    word_dict = {}
    for w1, w2 in nltk.bigrams(words):
        word_dict.setdefault(w1, []).append(w2)

    # Random-walk the successor table to build a sentence.
    sentence = []
    current_word = random.choice(list(word_dict.keys()))
    sentence.append(current_word)
    for _ in range(10):
        # Bug fix: the corpus's final token appears only as a successor,
        # never as a key, so reaching it would raise KeyError. Stop early
        # instead of crashing.
        if current_word not in word_dict:
            break
        next_word = random.choice(word_dict[current_word])
        sentence.append(next_word)
        current_word = next_word

    # Print the generated sentence
    print("\nGenerated Sentence:")
    print(' '.join(sentence))

def main():
    """Run the word-analysis and word-generation demos in sequence."""
    print("Word Analysis:")
    word_analysis()
    print("\nWord Generation:")
    word_generation()


if __name__ == "__main__":
    main()

Output:-

Word Analysis:

10 Most Common Words:

[('the', 62713), (',', 58334), ('.', 49346), ('of', 36080), ('and', 27915), ('to', 25732), ('a',
21881), ('in', 19536), ('that', 10237), ('is', 10011)]

Word Generation:

Generated Sentence:

combination of radiologist in their own issues for financing their ability to create a
different thing . And in contrast to learn to the games where you have been
4. Create a Sample list for at least 5 words with ambiguous sense and
Write a Python program to implement WSD
from nltk.wsd import lesk

from nltk.tokenize import word_tokenize

def wsd(sample_sentences):
    """Print the Lesk-chosen WordNet sense for each token of each sentence.

    Tokens for which Lesk finds no synset are silently skipped.
    """
    for sentence in sample_sentences:
        tokens = word_tokenize(sentence)
        for token in tokens:
            sense = lesk(tokens, token)
            if sense is None:
                continue
            print("Word:", token)
            print("Definition:", sense.definition())
            print("Example:", sense.examples())
            print("-------------------------------------------------")

def main():
    """Disambiguate ambiguous words in a small list of sample sentences."""
    sample_sentences = [
        ("The bank can guarantee deposits will eventually cover future "
         "tuition costs because it invests in adjustable-rate mortgage "
         "securities."),
        "I went to the bank to deposit my money.",
        "The bark of the tree was rough.",
        "I heard a loud bark from the dog.",
        "I need to address the issue with the address provided.",
    ]
    wsd(sample_sentences)


if __name__ == "__main__":
    main()

Output:-

Word: bank

Definition: a financial institution that accepts deposits and channels the


money into lending activities

Example: ['he cashed a check at the bank', 'that bank holds the mortgage
on my home']

-------------------------------------------------

Word: bank

Definition: a financial institution where money is kept for saving or


commercial purposes or is invested, supplied for loans, or exchanged.

Example: ['he cashed a check at the bank', 'that bank holds the mortgage
on my home']

-------------------------------------------------

Word: bark

Definition: the sound made by a dog

Example: ["the dog's barking kept me awake all night"]

-------------------------------------------------

Word: bark

Definition: tough protective covering of the woody stems and roots of trees
and other woody plants

Example: ['it was stripped of bark']

-------------------------------------------------
Word: address

Definition: the place where a person or organization can be found or


communicated with

Example: ["he didn't leave an address", 'my address is 123 Main Street']

-------------------------------------------------

Word: address

Definition: give a speech to

Example: ['The chairman addressed the board of trustees']

-------------------------------------------------

5. Install NLTK tool kit and perform stemming

import nltk

# Fetch the NLTK resources needed for tokenization and stemming.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Sample text
text = ("It is important to be very pythonly while you are pythoning "
        "with python. All pythoners have pythoned poorly at least once.")

# Tokenize the text
words = word_tokenize(text)

# Stem each token with the Porter algorithm.
porter = PorterStemmer()
stemmed_words = [porter.stem(word) for word in words]

# Show the text before and after stemming.
print("Original text:")
print(text)
print("\nStemmed text:")
print(" ".join(stemmed_words))

output:-

Original text:

It is important to be very pythonly while you are pythoning with


python. All pythoners have pythoned poorly at least once.

Stemmed text:

It is import to be veri pythonli while you are python with python .


all python have python poorli at least onc .
6. Create Sample list of at least 10 words POS tagging and find the
POS for any given word

import nltk

# Sample list of words
sample_words = ["Python", "Programming", "Language", "is", "widely", "used",
                "for", "developing", "various", "applications"]

# Perform POS tagging
pos_tags = nltk.pos_tag(sample_words)


def find_pos(word):
    """Return the POS tag assigned to *word* (case-insensitive lookup).

    Returns the string "POS not found" when *word* is not in the tagged list.
    """
    target = word.lower()
    for tagged_word, tag in pos_tags:
        if tagged_word.lower() == target:
            return tag
    return "POS not found"


# Test the function with a given word
given_word = "Python"
pos = find_pos(given_word)
print(f"POS tag for '{given_word}': {pos}")

Output:-

POS tag for 'Python': NN


7. Write a Python program to

a) Perform Morphological Analysis using NLTK library

b) Generate n-grams using NLTK N-Grams library

c) Implement N-Grams Smoothing also give me output

import nltk

from nltk.util import ngrams

from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer

from collections import Counter

import math

def morphological_analysis(text):
    """Tokenize *text*, drop English stop words, and lemmatize the rest.

    Returns the list of WordNet lemmas in their original order.
    """
    lemmatizer = WordNetLemmatizer()
    english_stops = set(stopwords.words('english'))
    return [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text)
        if token.lower() not in english_stops
    ]

def generate_ngrams(text, n):
    """Return the list of word-level *n*-grams of *text* as tuples."""
    tokens = nltk.word_tokenize(text)
    return list(ngrams(tokens, n))

def calculate_ngram_smoothing(n_grams):
    """Estimate add-one (Laplace) smoothed probabilities for *n_grams*.

    For each distinct n-gram the probability is

        (count(n_gram) + 1) / (count(context) + V)

    where the context is the n-gram minus its final word and V is the
    number of distinct n-grams observed (used here as the vocabulary size).

    Args:
        n_grams: iterable of n-gram tuples (as produced by nltk.util.ngrams).

    Returns:
        dict mapping each distinct n-gram tuple to its smoothed probability.
    """
    n_gram_counts = Counter(n_grams)
    # Count each (n-1)-gram context in one pass instead of rescanning the
    # whole n-gram list for every distinct n-gram (was O(n * distinct)).
    context_counts = Counter(ng[:-1] for ng in n_grams)
    vocab_size = len(n_gram_counts)

    return {
        n_gram: (count + 1) / (context_counts[n_gram[:-1]] + vocab_size)
        for n_gram, count in n_gram_counts.items()
    }

def main():
    """Drive the morphology, n-gram and smoothing demonstrations."""
    text = "The quick brown fox jumps over the lazy dog."
    print("Original Text:", text)

    # a) Morphological Analysis
    print("\nMorphological Analysis:", morphological_analysis(text))

    # b) Generate n-grams
    n = 3
    n_grams = generate_ngrams(text, n)
    print("\n{}-grams:".format(n), n_grams)

    # c) N-Grams Smoothing
    probabilities = calculate_ngram_smoothing(n_grams)
    print("\nN-Gram Probabilities (with Laplace smoothing):", probabilities)


if __name__ == "__main__":
    main()

Output:-
Original Text: The quick brown fox jumps over the lazy dog.

Morphological Analysis: ['The', 'quick', 'brown', 'fox', 'jump', 'lazy', 'dog', '.']

3-grams: [('The', 'quick', 'brown'), ('quick', 'brown', 'fox'), ('brown', 'fox', 'jumps'),
('fox', 'jumps', 'lazy'), ('jumps', 'lazy', 'dog'), ('lazy', 'dog', '.')]

N-Gram Probabilities (with Laplace smoothing): {('The', 'quick', 'brown'):


0.16666666666666666, ('quick', 'brown', 'fox'): 0.16666666666666666, ('brown',
'fox', 'jumps'): 0.16666666666666666, ('fox', 'jumps', 'lazy'): 0.16666666666666666,
('jumps', 'lazy', 'dog'): 0.16666666666666666, ('lazy', 'dog', '.'):
0.16666666666666666}

8. Using NLTK package to convert audio file to text and text file to
audio files.
import speech_recognition as sr

import pyttsx3

def audio_to_text(audio_file):
    """Transcribe *audio_file* via the Google Speech Recognition service.

    Returns the recognized text, or a human-readable error string if the
    audio could not be understood or the service could not be reached.
    """
    recognizer = sr.Recognizer()

    # Read the whole file into an AudioData object.
    with sr.AudioFile(audio_file) as source:
        audio_data = recognizer.record(source)

    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Speech Recognition could not understand audio"
    except sr.RequestError as e:
        return f"Could not request results from Speech Recognition service; {e}"

def text_to_audio(text, output_file):
    """Synthesize *text* to speech and write it to *output_file*."""
    tts_engine = pyttsx3.init()
    tts_engine.save_to_file(text, output_file)
    # runAndWait blocks until the queued file has actually been written.
    tts_engine.runAndWait()

if __name__ == "__main__":
    # Speech -> text
    source_wav = "audio_sample.wav"
    transcript = audio_to_text(source_wav)
    print("Text from audio:", transcript)

    # Text -> speech
    target_wav = "output_audio.wav"
    text_to_audio(transcript, target_wav)
    print("Text converted to audio")

Output:-

Text from audio: hello how are you

Text converted to audio

You might also like