
SESSION - (2024-25)

NATURAL LANGUAGE PROCESSING

LAB MANUAL

DEPARTMENT OF COMPUTER ENGINEERING AND APPLICATIONS (CSE-AIML)

GLA University

Submitted By- Lucky Goyal
Submitted To- Mr. Ankur Mishra


Week-1

Aim: a) Write a python program to perform tokenization by word and sentence using nltk.

# Import the necessary modules from nltk


import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Sample text for tokenization


text = """Hello! This is an example sentence. Tokenization splits text
into smaller parts.
It can split by sentences or by words."""

# Download the necessary resources


nltk.download('punkt')

# Sentence tokenization
sentences = sent_tokenize(text)
print("Sentence Tokenization:")
print(sentences)

# Word tokenization
words = word_tokenize(text)
print("\nWord Tokenization:")
print(words)

Output -
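
Note: depending on the installed NLTK version, the Punkt tokenizer data may be packaged as a separate 'punkt_tab' resource (an assumption about newer NLTK releases; older ones only need 'punkt'). If the download above is not enough, the following extra download can be added:

# Only needed if sent_tokenize/word_tokenize report a missing 'punkt_tab' resource
nltk.download('punkt_tab')
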
b) Write a python program to eliminate stopwords using nltk

# Import the necessary modules from nltk


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sample text for stopword removal


text = "This is an example sentence showing how to remove stopwords
using nltk."

# Download the necessary resources


nltk.download('punkt')
nltk.download('stopwords')

# Define the English stopwords


stop_words = set(stopwords.words('english'))

# Tokenize the text into words


words = word_tokenize(text)

# Remove stopwords from the tokenized words


filtered_words = [word for word in words if word.lower() not in stop_words]

print("Original Words:", words)


print("Filtered Words (without stopwords):", filtered_words)
Output-

c) Write a python program to perform stemming using nltk

# Import the necessary modules from nltk


import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Sample text for stemming


text = "The leaves on the tree are falling, and the wind is blowing
strongly."

# Download the necessary resources


nltk.download('punkt')

# Initialize the Porter Stemmer


stemmer = PorterStemmer()

# Tokenize the text into words


words = word_tokenize(text)

# Apply stemming to each word


stemmed_words = [stemmer.stem(word) for word in words]

print("Original Words:", words)


print("Stemmed Words:", stemmed_words)

Output-
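
For comparison, NLTK also provides a Snowball ("Porter2") stemmer. A minimal optional sketch, reusing the words list from the program above:

from nltk.stem import SnowballStemmer

# Stem the same tokens with the Snowball stemmer and compare with the Porter output
snowball = SnowballStemmer('english')
snowball_words = [snowball.stem(word) for word in words]
print("Snowball Stemmed Words:", snowball_words)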

Week-2

a) Write a python program to perform Parts of Speech tagging using nltk.

# Import the necessary modules from nltk


import nltk
from nltk.tokenize import word_tokenize

# Sample text for POS tagging


text = "The quick brown fox jumps over the lazy dog."

# Download the necessary resources


nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Tokenize the text into words


words = word_tokenize(text)

# Perform POS tagging


pos_tags = nltk.pos_tag(words)

print("Word and POS Tags:")


for word, tag in pos_tags:
    print(f"{word}: {tag}")

Output-

b) Write a python program to perform lemmatization using nltk.

# Import the necessary modules from nltk


import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Sample text for lemmatization


text = "The striped bats are hanging on their feet for best."

# Download the necessary resources


nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize the WordNet Lemmatizer


lemmatizer = WordNetLemmatizer()

# Tokenize the text into words


words = word_tokenize(text)

# Apply lemmatization to each word


lemmatized_words = [lemmatizer.lemmatize(word) for word in words]

print("Original Words:", words)


print("Lemmatized Words:", lemmatized_words)

Output-
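
By default, WordNetLemmatizer treats every token as a noun, so verbs such as "hanging" are returned unchanged. A minimal optional sketch of POS-aware lemmatization, reusing lemmatizer and words from the program above and assuming the 'averaged_perceptron_tagger' resource is available:

from nltk import pos_tag
from nltk.corpus import wordnet

nltk.download('averaged_perceptron_tagger')

# Map a Penn Treebank tag to the corresponding WordNet POS constant
def to_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

# Lemmatize each word using its tagged part of speech
pos_lemmas = [lemmatizer.lemmatize(word, to_wordnet_pos(tag)) for word, tag in pos_tag(words)]
print("POS-aware Lemmatized Words:", pos_lemmas)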

Week-3

a) Write a python program for chunking using nltk.

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag

# Download the necessary resources
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def chunk_sentence(sentence):
    """Chunks a given sentence using NLTK's RegexpParser.

    Args:
        sentence: The sentence to be chunked.

    Returns:
        The chunked sentence as an nltk.Tree.
    """
    # Tokenize the sentence into words
    words = word_tokenize(sentence)

    # Perform POS tagging
    pos_tags = pos_tag(words)

    # Define chunk grammar rules
    chunk_grammar = r"""
        NP: {<DT>?<JJ>*<NN>+}   # Noun Phrase
        VP: {<VB.*>}            # Verb Phrase
        PP: {<IN> <NP>}         # Prepositional Phrase
    """

    # Create a chunk parser
    chunk_parser = nltk.RegexpParser(chunk_grammar)

    # Parse the sentence
    chunked_sentence = chunk_parser.parse(pos_tags)

    return chunked_sentence

# Example usage
sentence = "The quick brown fox jumps over the lazy dog."
chunked_sentence = chunk_sentence(sentence)

print(chunked_sentence)

Output-
b) Write a python program to perform Named Entity Recognition using nltk

# program for NER


import spacy
from spacy import displacy

# Load the English language model


nlp = spacy.load("en_core_web_sm")

# Define your text


text = "Apple Inc. was founded by Steve Jobs and Steve Wozniak in California on April 1, 1976."

# Process the text with spaCy


doc = nlp(text)

displacy.render(doc, style="ent")

Output -
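
The program above uses spaCy. Since the aim mentions nltk, the following is a minimal NLTK-only sketch using ne_chunk (assuming the listed resources download successfully; it is an optional alternative, not part of the original program):

import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

text = "Apple Inc. was founded by Steve Jobs and Steve Wozniak in California on April 1, 1976."

# ne_chunk returns a tree; labelled subtrees are the named entities
tree = ne_chunk(pos_tag(word_tokenize(text)))
for subtree in tree:
    if hasattr(subtree, 'label'):
        entity = " ".join(token for token, tag in subtree.leaves())
        print(f"{entity}: {subtree.label()}")
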
Week-4

a) Write a python program to find Term Frequency and Inverse Document Frequency (TF-IDF).

import math

# Code to calculate TF
def compute_tf(doc):
    tf = {}
    total_words = len(doc)
    for word in doc:
        word_lower = word.lower()
        if word_lower in tf:
            tf[word_lower] += 1
        else:
            tf[word_lower] = 1
    # Normalize by total number of words in the document
    for word in tf:
        tf[word] /= total_words
    return tf

# Code to calculate IDF
def compute_idf(corpus):
    idf = {}
    total_docs = len(corpus)
    # Count, for each word, the number of documents it appears in
    word_doc_count = {}
    for doc in corpus:
        words_in_doc = set(word.lower() for word in doc)
        for word in words_in_doc:
            if word not in word_doc_count:
                word_doc_count[word] = 1
            else:
                word_doc_count[word] += 1
    # Calculate IDF (the +1 in the denominator smooths the document count)
    for word, count in word_doc_count.items():
        idf[word] = math.log(total_docs / (1 + count))
    return idf

# Code to calculate TF-IDF
def compute_tfidf(doc, tf, idf):
    tfidf = {}
    for word in tf:
        tfidf[word] = tf[word] * idf.get(word, 0)  # Use 0 for words not in the IDF dictionary
    return tfidf

# Example usage

# Sample corpus (list of documents)


corpus = [
["the", "sky", "is", "blue"],
["the", "sun", "is", "bright"],
["the", "sun", "in", "the", "sky", "is", "bright"],
["we", "can", "see", "the", "shining", "sun", "the", "bright", "sun"]
]

# Step 1: Compute TF for each document


tfs = [compute_tf(doc) for doc in corpus]

# Step 2: Compute IDF using the entire corpus


idf = compute_idf(corpus)

# Step 3: Compute TF-IDF for each document


tfidfs = [compute_tfidf(doc, tf, idf) for doc, tf in zip(corpus, tfs)]

# Output results
print("TF for each document:")
for i, tf in enumerate(tfs):
    print(f"Document {i+1} TF: {tf}")

print("\nIDF for the corpus:")
print(idf)

print("\nTF-IDF for each document:")
for i, tfidf in enumerate(tfidfs):
    print(f"Document {i+1} TF-IDF: {tfidf}")

Output -
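
As an optional cross-check, the same corpus can be fed to scikit-learn's TfidfVectorizer. The absolute scores will differ from the manual values above because scikit-learn uses a smoothed IDF formula and L2-normalizes each document vector:

from sklearn.feature_extraction.text import TfidfVectorizer

# Reuse the token lists from above by joining them back into plain strings
docs = [" ".join(doc) for doc in corpus]

vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(docs)

for i, row in enumerate(matrix.toarray()):
    scores = dict(zip(vectorizer.get_feature_names_out(), row))
    nonzero = {word: round(score, 3) for word, score in scores.items() if score > 0}
    print(f"Document {i+1} TF-IDF (scikit-learn): {nonzero}")
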
b) Write a python program for CYK parsing (Cocke-Younger-Kasami parsing) or chart parsing.

import nltk
from nltk import CFG

# Define a simple context-free grammar (CFG)


grammar = CFG.fromstring("""
S -> NP VP
VP -> V NP
NP -> Det N
V -> "saw" | "ate"
Det -> "a" | "an" | "the"
N -> "dog" | "cat" | "man"
""")

# Define the CYK parser function
def cyk_parse(grammar, sentence):
    # Tokenize the sentence
    sentence = sentence.split()

    # Initialize the chart (a table of sets)
    n = len(sentence)
    chart = [[set() for _ in range(n)] for _ in range(n)]

    # Fill the chart with terminal symbols (words in the sentence)
    for j in range(n):
        for production in grammar.productions(rhs=sentence[j]):
            chart[j][j].add(production.lhs())

    # Fill the chart with non-terminal symbols for substrings of length > 1
    for length in range(2, n + 1):        # length of the span (from 2 to n)
        for i in range(n - length + 1):   # starting point of the span
            j = i + length - 1            # end point of the span
            for k in range(i, j):         # split point of the span
                # Find all binary productions that could generate the span
                for production in grammar.productions():
                    rhs = production.rhs()
                    if len(rhs) == 2 and rhs[0] in chart[i][k] and rhs[1] in chart[k + 1][j]:
                        chart[i][j].add(production.lhs())

    # Check if the start symbol can generate the entire sentence
    return grammar.start() in chart[0][n - 1]

# Example sentence to parse


sentence = "the cat saw a dog"

# Perform CYK parsing


if cyk_parse(grammar, sentence):
print("The sentence can be generated by the grammar.")
else:
print("The sentence cannot be generated by the grammar.")

Output-
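
The aim also mentions chart parsing. NLTK's built-in ChartParser can parse the same sentence with the grammar defined above and print the resulting parse trees (a minimal optional sketch):

import nltk

# Chart parsing with the same CFG; parse() yields every tree the grammar licenses
chart_parser = nltk.ChartParser(grammar)
for tree in chart_parser.parse("the cat saw a dog".split()):
    print(tree)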

Week-5

a) Write a python program to find all unigrams, bigrams and trigrams present in the given corpus.

import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

# Download the necessary resources
nltk.download('punkt')

# Sample text (corpus)
corpus = "This is a simple example sentence for extracting unigrams, bigrams, and trigrams."

# Tokenize the corpus into words


words = word_tokenize(corpus)

# Find unigrams (single words)


unigrams = list(ngrams(words, 1))

# Find bigrams (pairs of words)


bigrams = list(ngrams(words, 2))

# Find trigrams (triplets of words)


trigrams = list(ngrams(words, 3))

# Print the results


print("Unigrams:")
print(unigrams)

print("\nBigrams:")
print(bigrams)

print("\nTrigrams:")
print(trigrams)

Output-
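
Optionally, the extracted n-grams can be counted with FreqDist to see which occur most often (a small sketch reusing the bigrams list from above):

from nltk import FreqDist

# Frequency distribution over the bigram tuples
bigram_freq = FreqDist(bigrams)
print("Most common bigrams:", bigram_freq.most_common(3))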

b) Write a python program to find the probability of the statement "This is my cat" by taking an example corpus into consideration, e.g. 'This is a dog', 'This is a cat', 'I love my cat', 'This is my name'.

import nltk
from nltk import FreqDist
from nltk.util import bigrams, ngrams
from nltk.tokenize import word_tokenize

# Sample corpus
corpus = """
This is my cat. My cat is black. The cat is playing. This is a simple
sentence.
"""

# Tokenize the corpus into words
words = word_tokenize(corpus.lower())  # Lowercase to make it case-insensitive

# Unigram Model: Calculate the frequency distribution of words in the corpus
unigram_freq = FreqDist(words)

# Bigram Model: Generate bigrams from the tokenized words


bigram_list = list(bigrams(words))
bigram_freq = FreqDist(bigram_list)

# Sentence for which we want to calculate the probability


sentence = "This is my cat"
sentence_tokens = word_tokenize(sentence.lower())

# Calculate Unigram Probability
def unigram_probability(sentence_tokens, unigram_freq):
    total_words = sum(unigram_freq.values())  # Total number of words in the corpus
    prob = 1.0
    for word in sentence_tokens:
        prob *= unigram_freq[word] / total_words
    return prob

# Calculate Bigram Probability
def bigram_probability(sentence_tokens, bigram_freq, unigram_freq):
    prob = unigram_freq[sentence_tokens[0]] / sum(unigram_freq.values())  # P(w1)
    for i in range(1, len(sentence_tokens)):
        prob *= bigram_freq[(sentence_tokens[i-1], sentence_tokens[i])] / unigram_freq[sentence_tokens[i-1]]
    return prob

# Calculate probabilities for the given sentence
unigram_prob = unigram_probability(sentence_tokens, unigram_freq)
bigram_prob = bigram_probability(sentence_tokens, bigram_freq, unigram_freq)

# Print the results


print(f"Unigram Probability of '{sentence}': {unigram_prob}")
print(f"Bigram Probability of '{sentence}': {bigram_prob}")

Output-
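
Note that the unsmoothed bigram model above returns a probability of 0 whenever the sentence contains a bigram that never occurs in the corpus. A minimal add-one (Laplace) smoothed variant, reusing bigram_freq, unigram_freq, sentence_tokens and sentence from the program above:

# Add-one (Laplace) smoothing: every count is incremented by 1 and the denominator
# grows by the vocabulary size, so unseen bigrams get a small non-zero probability
# instead of zeroing out the whole product.
def smoothed_bigram_probability(sentence_tokens, bigram_freq, unigram_freq):
    vocab_size = len(unigram_freq)
    prob = (unigram_freq[sentence_tokens[0]] + 1) / (sum(unigram_freq.values()) + vocab_size)
    for i in range(1, len(sentence_tokens)):
        prev, curr = sentence_tokens[i - 1], sentence_tokens[i]
        prob *= (bigram_freq[(prev, curr)] + 1) / (unigram_freq[prev] + vocab_size)
    return prob

print(f"Smoothed Bigram Probability of '{sentence}':",
      smoothed_bigram_probability(sentence_tokens, bigram_freq, unigram_freq))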

Week-6

Use the Stanford Named Entity Recognizer to extract entities from the documents. Use it programmatically and output, for each document, which named entities it contains and of which type.

pip install stanza

import stanza

# Initialize the NER pipeline using stanza (the Stanford NLP Group's Python library)
stanza.download('en')  # This will download the English model
nlp = stanza.Pipeline('en', processors='tokenize,ner')

# Example list of documents


documents = [
    "Barack Obama was born in Hawaii. He was the 44th president of the United States.",
    "Apple Inc. is looking to expand its business in Europe. Tim Cook is the CEO of Apple.",
    "Elon Musk, the CEO of Tesla, plans to send humans to Mars by 2024."
]

# Function to extract Named Entities from a document


def extract_named_entities(doc):
    # Process the document through the NER pipeline
    doc = nlp(doc)

    # Extract the named entities and their types
    entities = []
    for ent in doc.ents:
        entities.append((ent.text, ent.type))
    return entities

# Iterate over the documents and extract entities
for i, doc in enumerate(documents):
    print(f"Document {i+1}:")
    entities = extract_named_entities(doc)

    if entities:
        print("Named Entities and their Types:")
        for entity in entities:
            print(f"Entity: {entity[0]}, Type: {entity[1]}")
    else:
        print("No named entities found.")
    print()

Output-
Week-7

Choose any corpus freely available on the internet. For each document in the corpus, count how many
times each stop word occurs and find out which stop words occur most frequently. Further, calculate
the term frequency and inverse document frequency. The motivation behind this is to find out how
important a document is to a given query. For example, if the query is "The brown crow", then "the"
is less important, while "brown" and "crow" are relatively more important. Since "the" is a very
common word, its tf will be high; multiplying tf by idf, which is low for common words, reduces its
weight.
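
As a small worked illustration of this weighting (the counts below are invented for illustration only, not taken from any real corpus):

import math

total_docs = 1000                                   # assumed corpus size
doc_freq = {"the": 990, "brown": 120, "crow": 4}    # assumed document frequencies
tf = {"the": 0.10, "brown": 0.02, "crow": 0.02}     # assumed term frequencies in one document

# "the" has a high tf but appears in almost every document, so its idf (and tf-idf) is tiny;
# "crow" is rare, so idf boosts its weight.
for word in ["the", "brown", "crow"]:
    idf = math.log(total_docs / (1 + doc_freq[word]))
    print(f"{word}: tf={tf[word]:.2f}, idf={idf:.3f}, tf-idf={tf[word] * idf:.4f}")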

import nltk
from nltk.corpus import stopwords
from nltk.corpus import reuters
from nltk.probability import FreqDist
from nltk.text import TextCollection
import math

# Download required NLTK resources


nltk.download('stopwords')
nltk.download('reuters')

# Load the stop words


stop_words = set(stopwords.words('english'))
# Load the Reuters corpus
documents = reuters.sents()

# 1. Count how many times each stop word occurs in the documents
stop_word_counts = {}

for doc in documents:
    for word in doc:
        word_lower = word.lower()
        if word_lower in stop_words:
            if word_lower not in stop_word_counts:
                stop_word_counts[word_lower] = 1
            else:
                stop_word_counts[word_lower] += 1

# Sort stop words by frequency
sorted_stop_word_counts = sorted(stop_word_counts.items(), key=lambda x: x[1], reverse=True)

# Print the most frequent stop words
print("Most Frequent Stop Words:")
for word, count in sorted_stop_word_counts[:10]:
    print(f"{word}: {count}")

# 2. Calculate Term Frequency (TF)
def compute_tf(doc):
    tf = {}
    total_words = len(doc)
    for word in doc:
        word_lower = word.lower()
        if word_lower in tf:
            tf[word_lower] += 1
        else:
            tf[word_lower] = 1
    # Normalize by total number of words in the document
    for word in tf:
        tf[word] /= total_words
    return tf

# 3. Calculate Inverse Document Frequency (IDF)
def compute_idf(corpus):
    idf = {}
    total_docs = len(corpus)
    # Count, for each word, the number of documents it appears in
    word_doc_count = {}
    for doc in corpus:
        words_in_doc = set(word.lower() for word in doc)
        for word in words_in_doc:
            if word not in word_doc_count:
                word_doc_count[word] = 1
            else:
                word_doc_count[word] += 1

    # Calculate IDF for each word (the +1 in the denominator smooths the document count)
    for word, doc_count in word_doc_count.items():
        idf[word] = math.log(total_docs / (1 + doc_count))

    return idf

# 4. Calculate TF-IDF
def compute_tfidf(doc, tf, idf):
    tfidf = {}
    for word in tf:
        tfidf[word] = tf[word] * idf.get(word, 0)  # Use 0 for words not in the IDF dictionary
    return tfidf

# Select a few sample documents
sample_docs = documents[:5]  # Use the first 5 documents as samples for analysis

# Compute TF, IDF, and TF-IDF for the sample documents
corpus = documents  # Full corpus for IDF computation
idf = compute_idf(corpus)

for idx, doc in enumerate(sample_docs):
    print(f"\nDocument {idx + 1} TF-IDF:")
    tf = compute_tf(doc)
    tfidf = compute_tfidf(doc, tf, idf)

    # Sort TF-IDF values by their score
    sorted_tfidf = sorted(tfidf.items(), key=lambda x: x[1], reverse=True)

    # Print the top 10 TF-IDF scores for the document
    for word, score in sorted_tfidf[:10]:
        print(f"{word}: {score:.5f}")

Output-

Week-8

a. Write the python code to perform sentiment analysis using NLP

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon


nltk.download('vader_lexicon')

# Initialize the SentimentIntensityAnalyzer


sia = SentimentIntensityAnalyzer()

# Example texts
texts = [
"I love this product! It's amazing and works perfectly.",
"This is the worst experience I've ever had. So disappointed.",
"I'm not sure how I feel about this. It's okay, I guess.",
"Fantastic! I will definitely buy this again. Highly recommend!",
"The movie was a bit long and boring. Could have been better."
]

# Perform sentiment analysis for each text
for text in texts:
    # Get the sentiment scores
    sentiment_score = sia.polarity_scores(text)

    # Determine the overall sentiment from the compound score
    if sentiment_score['compound'] >= 0.05:
        sentiment = 'Positive'
    elif sentiment_score['compound'] <= -0.05:
        sentiment = 'Negative'
    else:
        sentiment = 'Neutral'

    # Print the result
    print(f"Text: {text}")
    print(f"Sentiment: {sentiment}")
    print(f"Sentiment Scores: {sentiment_score}")
    print()

Output-

Week-9

1. Write the python code to develop a Spam Filter using NLP


import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import string

# Download necessary resources


nltk.download('punkt')
nltk.download('stopwords')

# Sample data - you can replace this with a dataset like the 'SMS Spam Collection' dataset
# A list of (message, label) tuples
data = [
    ("Hey, how are you?", "ham"),
    ("Free cash prize, claim now!", "spam"),
    ("Call me when you get this message", "ham"),
    ("Limited time offer, win a lottery!", "spam"),
    ("Let's meet tomorrow", "ham"),
    ("Congratulations, you've won a free ticket", "spam"),
    ("Are we still meeting at 5?", "ham"),
    ("Earn money from home. Apply now", "spam"),
]

# Step 1: Preprocess the text (lowercasing, removing punctuation, stopwords, etc.)
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    words = word_tokenize(text)  # Tokenize the text into words
    words = [word for word in words if word not in stopwords.words('english')]  # Remove stop words
    return ' '.join(words)

# Preprocess the data
messages, labels = zip(*data)
messages = [preprocess_text(msg) for msg in messages]

# Step 2: Convert text data into numerical features using CountVectorizer (Bag of Words)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(messages)

# Step 3: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)

# Step 4: Train a Naive Bayes Classifier


model = MultinomialNB()
model.fit(X_train, y_train)

# Step 5: Predict the labels for the test set


y_pred = model.predict(X_test)

# Step 6: Evaluate the model


print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Example usage: Predict if a new message is spam or ham


new_message = "You have won a free gift, claim it now!"
preprocessed_message = preprocess_text(new_message)
vectorized_message = vectorizer.transform([preprocessed_message])
prediction = model.predict(vectorized_message)

print(f"Message: '{new_message}'")
print(f"Prediction: {prediction[0]}") # spam or ham

Output-
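
Optionally, the preprocessing step, vectorizer, and classifier can be bundled into a single scikit-learn Pipeline so that raw messages go straight into fit and predict (a sketch reusing data and preprocess_text from the program above):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Unpack the raw (unpreprocessed) messages and labels again
raw_messages, raw_labels = zip(*data)

spam_pipeline = Pipeline([
    ("vectorizer", CountVectorizer(preprocessor=preprocess_text)),  # preprocessing happens inside the pipeline
    ("classifier", MultinomialNB()),
])
spam_pipeline.fit(raw_messages, raw_labels)

print(spam_pipeline.predict(["Win a free iPhone today!"]))
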
Week-10

1. Write the python code to detect Fake News using NLP

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import string
import nltk

# Download necessary resources


nltk.download('stopwords')
from nltk.corpus import stopwords

# Sample fake-news dataset (replace this with a real dataset such as "fake_news.csv")
# For this example, the data is structured as 'text' and 'label' columns
data = {
    'text': [
        "Breaking: Scientists have discovered a new planet.",
        "New study finds that vaccines cause autism.",
        "Global warming is accelerating at an unprecedented rate.",
        "Aliens have made contact with Earth, according to new reports.",
        "The government announces new tax relief measures for small businesses."
    ],
    'label': ['real', 'fake', 'real', 'fake', 'real']
}

# Create DataFrame
df = pd.DataFrame(data)

# Step 1: Preprocess the text (lowercasing, removing punctuation, stopwords, etc.)
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = ''.join([char for char in text if char not in string.punctuation])

    # Tokenize and remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]

    return ' '.join(words)

# Apply preprocessing
df['text'] = df['text'].apply(preprocess_text)

# Step 2: Convert text data into numerical features using TF-IDF vectorization
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['text'])

# Step 3: Convert labels to binary values (real -> 0, fake -> 1)


y = df['label'].map({'real': 0, 'fake': 1})

# Step 4: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 5: Train a Logistic Regression classifier


model = LogisticRegression()
model.fit(X_train, y_train)

# Step 6: Predict the labels for the test set


y_pred = model.predict(X_test)

# Step 7: Evaluate the model


print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Example usage: predict whether a new news article is fake or real
new_article = "A new study reveals that eating chocolate improves brain function!"
preprocessed_article = preprocess_text(new_article)
vectorized_article = vectorizer.transform([preprocessed_article])
prediction = model.predict(vectorized_article)

print(f"Article: '{new_article}'")
print(f"Prediction: {'fake' if prediction[0] == 1 else 'real'}")

Output-
