Programs code

This document contains multiple Python programs that use the NLTK library for natural language processing tasks such as text preprocessing, tokenization, part-of-speech tagging, named entity recognition, and typo correction. Key functionalities include counting the total and unique words in Moby Dick, generating TF-IDF matrices, and enhancing customer messages by correcting typos. The programs also demonstrate stemming and lemmatization on titles extracted from a JSONL file.


Program 1

# Import necessary libraries
import nltk
from nltk.corpus import gutenberg, stopwords
import string

# Download necessary NLTK resources
nltk.download('gutenberg')
nltk.download('stopwords')
nltk.download('punkt')

# Load the text of Moby Dick from the Gutenberg corpus
raw_text = gutenberg.raw('melville-moby_dick.txt')

# Define a preprocessing function to clean and tokenize the text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text into words
    words = nltk.word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    return filtered_words

# Preprocess the text
processed_text = preprocess_text(raw_text)

# Count the number of documents (we have only one document here: Moby Dick)
document_count = 1

# Count the total number of words and unique words
word_count = len(processed_text)
unique_words = set(processed_text)
unique_word_count = len(unique_words)

# Display the results
print("Document Count:", document_count)
print("Total Words in Corpus:", word_count)
print("Unique Words in Corpus:", unique_word_count)

Program 3

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

corpus = ["The quick brown fox jumps over the lazy dog."]

# Tokenize each sentence, then flatten into a single lowercase word list
tokenized_corpus = [word_tokenize(sentence) for sentence in corpus]
flat_corpus = [word.lower() for sentence in tokenized_corpus for word in sentence]

# Compute the TF-IDF matrix over the documents in the corpus
# (fitting on whole documents, not on individual words)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Part-of-speech tagging and named entity recognition on the word list
pos_tags = pos_tag(flat_corpus)
named_entities = ne_chunk(pos_tags)

# Chunk the tagged words into noun, verb and prepositional phrases
grammar = r"""
NP: {<DT|JJ|NN.*>+}
VP: {<VB.*>+}
PP: {<IN> <NP>}
"""
chunk_parser = nltk.RegexpParser(grammar)
chunked_sentences = chunk_parser.parse(pos_tags)

print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())
print("\nPart-of-Speech Tags:")
print(pos_tags)
print("\nNamed Entities:")
print(named_entities)
print("\nChunked Sentences:")
print(chunked_sentences)
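
To make the raw TF-IDF array easier to read, each column can be mapped back to its vocabulary term. A minimal follow-up sketch, assuming scikit-learn 1.0 or newer (where get_feature_names_out is available):

# Map each TF-IDF column index back to its vocabulary term
terms = vectorizer.get_feature_names_out()
for term, weight in zip(terms, tfidf_matrix.toarray()[0]):
    print(f"{term}: {weight:.3f}")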

Program 4:

!pip install textblob

from textblob import TextBlob

def correct_typos(text):
    # Create a TextBlob object for spell-checking
    blob = TextBlob(text)

    # Correct typos using TextBlob
    return str(blob.correct())

def enhance_message(message):
    # Correct typos and grammatical errors
    corrected_message = correct_typos(message)

    return corrected_message

# Example usage
if __name__ == "__main__":
    # Example customer message with typos and grammatical errors
    customer_message = "u r looking awsome"

    # Enhance the customer message
    enhanced_message = enhance_message(customer_message)

    # Print the corrected message
    print("Original Message: ", customer_message)
    print("Enhanced Message: ", enhanced_message)

Program 5:

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

import json
import re

# %%
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# %%
def stem_text(text):
    words = nltk.word_tokenize(text)
    return ' '.join(stemmer.stem(word) for word in words)

def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

# %%
file_path = "dataset.jsonl"

# %%
titles = []
with open(file_path, 'r') as file:
    for line in file:
        data = json.loads(line)
        title = data.get('metadata', {}).get('title')
        # Skip records that have no title
        if title:
            titles.append(title)

# %%
titles = [re.sub(r'\n', '', t) for t in titles]
print(titles[:5])

# %%
titles = [t.lower() for t in titles]
print(titles[:5])

# %%
stem_titles = [stem_text(t) for t in titles]
lemm_titles = [lemmatize_text(t) for t in titles]
print(stem_titles[:5])
print(lemm_titles[:5])

# %%
keyword = "fluid"
keyword = keyword.lower()

stem_matches = []
lemm_matches = []

# Search in stemmed titles
for stem_title in stem_titles:
    if keyword in stem_title.lower():
        stem_matches.append(stem_title)

# Search in lemmatized titles
for lemm_title in lemm_titles:
    if keyword in lemm_title.lower():
        lemm_matches.append(lemm_title)

print("Matches in stemmed titles:", len(stem_matches))
print("Matches in lemmatized titles:", len(lemm_matches))

# %%
for i in stem_matches:
    print(i)

# %%
for j in lemm_matches:
    print(j)
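
A quick way to see why the stemmed and lemmatized searches can return different counts is to compare the two normalizers directly on a few sample words, reusing the stemmer and lemmatizer objects defined above; the word list here is purely illustrative:

# %%
# Compare stemming and lemmatization on a few sample words
for word in ["studies", "running", "fluids"]:
    print(word, "->", stemmer.stem(word), "|", lemmatizer.lemmatize(word))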

Program 6:

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

def extract_named_entities(text):
    sentences = sent_tokenize(text)
    named_entities = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        pos_tags = pos_tag(tokens)
        chunks = ne_chunk(pos_tags)
        for chunk in chunks:
            # Subtrees with a label are named-entity chunks (PERSON, GPE, ORGANIZATION, ...)
            if hasattr(chunk, 'label'):
                entity_type = chunk.label()
                entity_name = ' '.join([c[0] for c in chunk])
                named_entities.append((entity_name, entity_type))
    return named_entities

text = """President Joe Biden is scheduled to meet with Ukrainian
President Volodymyr Zelenskyy at the White House on Tuesday.
The meeting comes amid ongoing discussions about military aid and
support for Ukraine's defense against Russian aggression."""

named_entities = extract_named_entities(text)
for entity_name, entity_type in named_entities:
    print(f"Entity: {entity_name}, Type: {entity_type}")
