Programs code
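# Fragment of an earlier preprocessing program: cleans the text of a single
# document (Moby Dick) and reports basic word counts.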
import string
import nltk
from nltk.corpus import stopwords

def preprocess(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize, lowercase, and drop stopwords (reconstructed from the surrounding fragment)
    words = nltk.word_tokenize(text.lower())
    filtered_words = [w for w in words if w not in stopwords.words('english')]
    return filtered_words

# Count the number of documents (we have only one document here: Moby Dick)
document_count = 1

# Count the total number of words and unique words (processed_text is the output of preprocess() on the loaded text)
word_count = len(processed_text)
unique_words = set(processed_text)
unique_word_count = len(unique_words)
Program 3:
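# Program 3 builds a TF-IDF matrix for a small corpus and applies NLTK
# part-of-speech tagging, named-entity chunking, and regex-based phrase chunking.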
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
corpus = ["The quick brown fox jumps over the lazy dog."]
tokenized_corpus = [word_tokenize(sentence) for sentence in corpus]
flat_corpus = [word.lower() for sentence in tokenized_corpus for word in sentence]
vectorizer = TfidfVectorizer()
# Fit TF-IDF on the sentence-level corpus rather than the flattened word list,
# so each row of the matrix corresponds to a document instead of a single word
tfidf_matrix = vectorizer.fit_transform(corpus)
pos_tags = pos_tag(flat_corpus)
named_entities = ne_chunk(pos_tags)
grammar = r"""
NP: {<DT|JJ|NN.*>+}
VP: {<VB.*>+}
PP: {<IN> <NP>}
"""
chunk_parser = nltk.RegexpParser(grammar)
chunked_sentences = chunk_parser.parse(pos_tags)
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())
print("\nPart-of-Speech Tags:")
print(pos_tags)
print("\nNamed Entities:")
print(named_entities)
print("\nChunked Sentences:")
print(chunked_sentences)
Program 4:
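# Program 4 uses TextBlob's spelling correction to clean up typos in a customer message.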
from textblob import TextBlob

def correct_typos(text):
    blob = TextBlob(text)
    return str(blob.correct())

def enhance_message(message):
    corrected_message = correct_typos(message)
    return corrected_message

# Example usage
if __name__ == "__main__":
    customer_message = "I havve a problem with my ordeer."  # sample input for illustration
    enhanced_message = enhance_message(customer_message)
    print(enhanced_message)
Program 5:
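# Program 5 stems and lemmatizes article titles from a JSONL dataset and
# searches both versions for a keyword.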
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import PorterStemmer, WordNetLemmatizer
import json
import re

# %%
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# %%
def stem_text(text):
    words = nltk.word_tokenize(text)
    # Join the stemmed tokens back into a string (reconstructed from how stem_titles is used below)
    return ' '.join(stemmer.stem(word) for word in words)

def lemmatize_text(text):
    words = nltk.word_tokenize(text)
    return ' '.join(lemmatizer.lemmatize(word) for word in words)

# %%
file_path = "dataset.jsonl"

# %%
# Read every JSON line and collect the article titles
titles = []
with open(file_path, 'r') as file:
    for line in file:
        data = json.loads(line)
        titles.append(data.get('metadata', {}).get('title'))

# %%
print(titles[:5])

# %%
# Build stemmed and lemmatized versions of the titles
stem_titles = [stem_text(title) for title in titles if title]
lemm_titles = [lemmatize_text(title) for title in titles if title]

# %%
print(stem_titles[:5])
print(lemm_titles[:5])

# %%
keyword = "fluid"
keyword = keyword.lower()
stem_matches = []
lemm_matches = []

# Search in stemmed titles
for stem_title in stem_titles:
    if keyword in stem_title.lower():
        stem_matches.append(stem_title)

# Search in lemmatized titles
for lemm_title in lemm_titles:
    if keyword in lemm_title.lower():
        lemm_matches.append(lemm_title)

# %%
for i in stem_matches:
    print(i)

# %%
for j in lemm_matches:
    print(j)
Program 6:
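# Program 6 performs named-entity recognition on a news snippet with NLTK's ne_chunk.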
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
def extract_named_entities(text):
    sentences = sent_tokenize(text)
    named_entities = []
    for sentence in sentences:
        tokens = word_tokenize(sentence)
        pos_tags = pos_tag(tokens)
        chunks = ne_chunk(pos_tags)
        for chunk in chunks:
            if hasattr(chunk, 'label'):
                entity_type = chunk.label()
                entity_name = ' '.join([c[0] for c in chunk])
                named_entities.append((entity_name, entity_type))
    return named_entities
text = """President Joe Biden is scheduled to meet with Ukrainian
President Volodymyr Zelenskyy at the White House on Tuesday.
The meeting comes amid ongoing discussions about military aid and
support for Ukraine's defense against Russian aggression."""
named_entities = extract_named_entities(text)
for entity_name, entity_type in named_entities:
    print(f"Entity: {entity_name}, Type: {entity_type}")