Natural Language Processing
def preprocess_text(text):
    """Tokenize *text* and remove English stopwords.

    Returns the list of tokens whose lowercased form is not an
    English stopword. Requires the NLTK 'punkt' and 'stopwords' data.
    """
    # Tokenization
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    # Keep only tokens that are not stopwords (case-insensitive check).
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens
def main():
    """Demo driver: tokenize a sample sentence and print the result."""
    text = ("NLTK is a leading platform for building Python programs to "
            "work with human language data.")
    preprocessed_text = preprocess_text(text)
    print("Original Text:")
    print(text)
    print("\nTokenized Text:")
    print(preprocessed_text)


if __name__ == "__main__":
    main()
Output:-
Original Text:
Tokenized Text:
def preprocess_text(text):
    """Split *text* into tokens and drop English stopwords.

    The stopword comparison is case-insensitive; punctuation tokens
    are kept because they never appear in the stopword list.
    """
    # Tokenization
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    filtered_tokens = []
    for word in tokens:
        if word.lower() not in stop_words:
            filtered_tokens.append(word)
    return filtered_tokens
def apply_stemming(tokens):
    """Reduce every token to its Porter stem.

    Returns a new list; punctuation tokens pass through unchanged
    (the Porter stemmer leaves non-alphabetic tokens as-is).
    """
    porter = PorterStemmer()
    stemmed_tokens = [porter.stem(token) for token in tokens]
    return stemmed_tokens
def main():
    """Demo driver: tokenize, remove stopwords, then stem a sample sentence."""
    text = ("NLTK is a leading platform for building Python programs to "
            "work with human language data.")
    preprocessed_text = preprocess_text(text)
    stemmed_text = apply_stemming(preprocessed_text)
    print("Original Text:")
    print(text)
    print("\nTokenized Text:")
    print(preprocessed_text)
    print("\nStemmed Text:")
    print(stemmed_text)


if __name__ == "__main__":
    main()
Output:-
Original Text:
NLTK is a leading platform for building Python programs to work with human
language data.
Tokenized Text:
Stemmed Text:
['nltk', 'lead', 'platform', 'build', 'python', 'program', 'work', 'human', 'languag', 'data',
'.']
def word_analysis():
    """Print the 10 most frequent tokens of the Brown corpus."""
    nltk.download('brown')
    corpus_tokens = brown.words()
    distribution = nltk.FreqDist(corpus_tokens)
    # Print 10 most common words
    print(distribution.most_common(10))
def word_generation():
    """Generate a random 11-word sentence from Brown-corpus bigrams.

    Builds a successor table mapping each word to the list of words
    observed to follow it, then performs a random walk of 10 steps
    from a random starting word. Output is non-deterministic.
    """
    nltk.download('brown')
    words = brown.words()
    bigrams = nltk.bigrams(words)
    word_dict = {}
    # Successor table: word -> list of words seen immediately after it.
    for w1, w2 in bigrams:
        if w1 not in word_dict:
            word_dict[w1] = []
        word_dict[w1].append(w2)
    # Generate a sentence
    import random
    sentence = []
    current_word = random.choice(list(word_dict.keys()))
    sentence.append(current_word)
    for _ in range(10):
        next_word = random.choice(word_dict[current_word])
        sentence.append(next_word)
        current_word = next_word
    print("\nGenerated Sentence:")
    print(' '.join(sentence))
def main():
    """Run the frequency-analysis demo, then the sentence-generation demo."""
    print("Word Analysis:")
    word_analysis()
    print("\nWord Generation:")
    word_generation()


if __name__ == "__main__":
    main()
Output:-
Word Analysis:
[('the', 62713), (',', 58334), ('.', 49346), ('of', 36080), ('and', 27915), ('to', 25732), ('a',
21881), ('in', 19536), ('that', 10237), ('is', 10011)]
Word Generation:
Generated Sentence:
combination of radiologist in their own issues for financing their ability to create a
different thing . And in contrast to learn to the games where you have been
4. Create a sample list of at least 5 words with ambiguous senses and
write a Python program to implement WSD (Word Sense Disambiguation).
from nltk.wsd import lesk
def wsd(sample_sentences):
    """Disambiguate every word of every sentence with the Lesk algorithm.

    For each token, prints the chosen WordNet synset's definition and
    examples. Tokens for which lesk() finds no synset are skipped.
    NOTE(review): the iteration structure was lost in extraction and is
    reconstructed from the printed output — confirm against the original.
    """
    for sentence in sample_sentences:
        words = word_tokenize(sentence)
        for word in words:
            # lesk() picks the synset whose gloss best overlaps the context.
            synset = lesk(words, word)
            if synset is not None:
                print("Word:", word)
                print("Definition:", synset.definition())
                print("Example:", synset.examples())
                print("-------------------------------------------------")
def main():
    """Demo driver: run WSD over one sentence with ambiguous 'bank'."""
    sample_sentences = [
        "The bank can guarantee deposits will eventually cover future "
        "tuition costs because it invests in adjustable-rate mortgage "
        "securities.",
    ]
    wsd(sample_sentences)


if __name__ == "__main__":
    main()
Output:-
Word: bank
Example: ['he cashed a check at the bank', 'that bank holds the mortgage
on my home']
-------------------------------------------------
Word: bank
Example: ['he cashed a check at the bank', 'that bank holds the mortgage
on my home']
-------------------------------------------------
Word: bark
-------------------------------------------------
Word: bark
Definition: tough protective covering of the woody stems and roots of trees
and other woody plants
-------------------------------------------------
Word: address
Example: ['he didn't leave an address', 'my address is 123 Main Street']
-------------------------------------------------
Word: address
-------------------------------------------------
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Sample text
# NOTE(review): the original sample text was lost in extraction;
# substitute any sentence — the pipeline below is text-agnostic.
text = "NLTK is a leading platform for building Python programs."
words = word_tokenize(text)
porter = PorterStemmer()
# Stem every token with the Porter stemmer.
stemmed_words = [porter.stem(word) for word in words]
print("Original text:")
print(text)
print("\nStemmed text:")
print(" ".join(stemmed_words))
output:-
Original text:
Stemmed text:
import nltk
Output:-
import nltk
import math
def morphological_analysis(text):
    """Tokenize *text*, remove English stopwords, and lemmatize the rest.

    NOTE(review): the expected output keeps 'The' but drops 'the' and
    'over', so the stopword filter must be case-SENSITIVE — confirm
    this is intended before changing it.
    """
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [tok for tok in tokens if tok not in stop_words]
    # Perform lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(tok) for tok in tokens]
    return tokens


def generate_ngrams(text, n):
    """Return the list of n-grams over the stopword-filtered tokens of *text*.

    NOTE(review): the printed 3-grams omit 'over'/'the' but keep 'jumps'
    unlemmatized, so n-grams are built from filtered (not lemmatized)
    tokens — reconstructed accordingly; confirm against the original.
    """
    tokens = nltk.word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [tok for tok in tokens if tok not in stop_words]
    # Generate n-grams
    n_grams = list(nltk.ngrams(tokens, n))
    return n_grams
def calculate_ngram_smoothing(n_grams):
    """Estimate add-one (Laplace) smoothed probabilities for *n_grams*.

    For each distinct n-gram, P(w_n | context) =
    (count(n-gram) + 1) / (count(context) + V), where the context is the
    n-gram minus its last word and V is the vocabulary size. Returns a
    dict mapping each distinct n-gram tuple to its probability. An empty
    input yields an empty dict.
    """
    n_gram_counts = Counter(n_grams)
    # Count each (n-1)-gram context and the vocabulary size once, up front.
    context_counts = Counter(ng[:-1] for ng in n_grams)
    vocab_size = len({word for ng in n_grams for word in ng})
    n_gram_probabilities = {}
    for n_gram, count in n_gram_counts.items():
        context = n_gram[:-1]
        probability = (count + 1) / (context_counts[context] + vocab_size)
        n_gram_probabilities[n_gram] = probability
    return n_gram_probabilities
def main():
    """Demo driver: morphological analysis, n-gram extraction, smoothing."""
    text = "The quick brown fox jumps over the lazy dog."
    print("Original Text:", text)
    # a) Morphological Analysis
    morph_analysis_result = morphological_analysis(text)
    print("Morphological Analysis:", morph_analysis_result)
    # b) Generate n-grams
    n = 3
    n_grams = generate_ngrams(text, n)
    print("\n{}-grams:".format(n), n_grams)
    # c) N-Grams Smoothing
    n_gram_probabilities = calculate_ngram_smoothing(n_grams)
    print("\nSmoothed n-gram probabilities:", n_gram_probabilities)


if __name__ == "__main__":
    main()
Output:-
Original Text: The quick brown fox jumps over the lazy dog.
Morphological Analysis: ['The', 'quick', 'brown', 'fox', 'jump', 'lazy', 'dog', '.']
3-grams: [('The', 'quick', 'brown'), ('quick', 'brown', 'fox'), ('brown', 'fox', 'jumps'),
('fox', 'jumps', 'lazy'), ('jumps', 'lazy', 'dog'), ('lazy', 'dog', '.')]
8. Use the SpeechRecognition and pyttsx3 packages to convert an audio
file to text and a text string to an audio file.
import speech_recognition as sr
import pyttsx3
def audio_to_text(audio_file):
    """Transcribe *audio_file* (WAV/AIFF/FLAC) via the Google Web Speech API.

    Returns the recognized text, or a human-readable error message when
    the audio is unintelligible or the web service cannot be reached
    (requires network access).
    """
    recognizer = sr.Recognizer()
    # Load the whole file into an AudioData object.
    with sr.AudioFile(audio_file) as source:
        audio_data = recognizer.record(source)
    # Convert audio to text
    try:
        text = recognizer.recognize_google(audio_data)
        return text
    except sr.UnknownValueError:
        return "Speech Recognition could not understand the audio"
    except sr.RequestError as e:
        return f"Could not request results from Speech Recognition service; {e}"
def text_to_audio(text, output_file):
    """Synthesize *text* to speech and save it to *output_file*.

    Uses the platform's default TTS voice via pyttsx3; runAndWait()
    blocks until the file has been written.
    """
    engine = pyttsx3.init()
    engine.save_to_file(text, output_file)
    engine.runAndWait()
if __name__ == "__main__":
audio_file = "audio_sample.wav"
text = audio_to_text(audio_file)
# Text to audio
output_file = "output_audio.wav"
text_to_audio(text, output_file)
Output:-