Final_NLP_Lab_File
LAB MANUAL
(CSE-AIML)
GLA University
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Sample text (placeholder; any input paragraph can be used)
text = "Natural language processing lets computers work with human language. It is used in search, translation, and chatbots."

# Sentence tokenization
sentences = sent_tokenize(text)
print("Sentence Tokenization:")
print(sentences)
# Word tokenization
words = word_tokenize(text)
print("\nWord Tokenization:")
print(words)
Output -
b) Write a python program to eliminate stopwords using nltk
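The program itself is not reproduced in this copy; a minimal sketch, assuming NLTK's English stop word list and a placeholder sentence, is:

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

# Sample text (placeholder)
text = "This is a simple sentence showing how stop words are removed."

stop_words = set(stopwords.words('english'))
words = word_tokenize(text)

# Keep only the tokens that are not stop words
filtered_words = [word for word in words if word.lower() not in stop_words]

print("Original words:", words)
print("Without stopwords:", filtered_words)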
Output-
Week-2
Output-
Output-
Week-3
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
def chunk_sentence(sentence):
    """Chunks a given sentence using NLTK's RegexpParser.

    Args:
        sentence: The sentence to be chunked.

    Returns:
        The chunked sentence as an nltk.Tree.
    """
    # Tokenize and POS-tag the sentence, then chunk noun phrases
    words = word_tokenize(sentence)
    tagged = pos_tag(words)
    grammar = "NP: {<DT>?<JJ>*<NN>}"  # simple noun-phrase chunking grammar
    chunk_parser = nltk.RegexpParser(grammar)
    chunked_sentence = chunk_parser.parse(tagged)
    return chunked_sentence
# Example usage
sentence = "The quick brown fox jumps over the lazy dog."
chunked_sentence = chunk_sentence(sentence)
print(chunked_sentence)
Output-
b) Write a python program to perform Named Entity Recognition using nltk
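The fragment below uses spaCy's displacy visualizer rather than nltk for rendering the entities. The setup code is not reproduced in this copy; a minimal sketch, assuming the en_core_web_sm model and a placeholder sentence, is:

import spacy
from spacy import displacy

# Load a small English pipeline (assumed model name)
nlp = spacy.load("en_core_web_sm")

# Sample text (placeholder)
doc = nlp("Apple is looking at buying a U.K. startup for $1 billion.")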
displacy.render(doc, style="ent")
Output -
Week-4
# code to calculate TF
def compute_tf(doc):
    tf = {}
    total_words = len(doc)
    for word in doc:
        word_lower = word.lower()
        if word_lower in tf:
            tf[word_lower] += 1
        else:
            tf[word_lower] = 1
    # Normalize by total number of words in the document
    for word in tf:
        tf[word] /= total_words
    return tf
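# Sample tokenized documents (placeholders; the original corpus is not shown in this copy)
documents = [
    "the quick brown fox jumps over the lazy dog".split(),
    "the dog sleeps near the quiet brown house".split(),
]

# Compute TF for every document
tfs = [compute_tf(doc) for doc in documents]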
# Output results
print("TF for each document:")
for i, tf in enumerate(tfs):
    print(f"Document {i+1} TF: {tf}")
Output -
b) Write a python program for CYK parsing (Cocke-Younger-Kasami Parsing) or Chart Parsing.
import nltk
from nltk import CFG
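The parsing code itself is not reproduced in this copy; the sketch below assumes a small toy grammar and uses nltk's ChartParser (the chart-parsing option named in the task) with the CFG class imported above:

# Toy context-free grammar (assumed for illustration)
grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N
VP -> V NP
Det -> 'the' | 'a'
N -> 'dog' | 'cat'
V -> 'chased' | 'saw'
""")

sentence = "the dog chased a cat".split()

# Build a chart parser over the grammar and print every parse tree found
parser = nltk.ChartParser(grammar)
for tree in parser.parse(sentence):
    print(tree)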
Output-
Week-5
import nltk
from nltk.util import ngrams
from nltk.tokenize import word_tokenize
# Sample text (corpus)
corpus = "This is a simple example sentence for extracting unigrams, bigrams, and trigrams."
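# Tokenization and n-gram extraction (reconstructed; not shown in this copy)
tokens = word_tokenize(corpus)
unigrams = list(ngrams(tokens, 1))
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))

print("Unigrams:")
print(unigrams)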
print("\nBigrams:")
print(bigrams)
print("\nTrigrams:")
print(trigrams)
Output-
import nltk
from nltk import FreqDist
from nltk.util import bigrams, ngrams
from nltk.tokenize import word_tokenize
# Sample corpus
corpus = """
This is my cat. My cat is black. The cat is playing. This is a simple
sentence.
"""
Output-
Week-6
import stanza

# Initialize the Stanford-style NER using stanza (which wraps Stanford NLP's NER models)
stanza.download('en')  # This will download the English model
nlp = stanza.Pipeline('en', processors='tokenize,ner')

# Sample text (placeholder; any input text can be used)
text = "Barack Obama was born in Hawaii and served as President of the United States."
doc = nlp(text)

# Collect (entity text, entity type) pairs from the processed document
entities = [(ent.text, ent.type) for ent in doc.ents]

if entities:
    print("Named Entities and their Types:")
    for entity in entities:
        print(f"Entity: {entity[0]}, Type: {entity[1]}")
else:
    print("No named entities found.")
print()
Output-
Week-7
Choose any corpus that is freely available on the internet. For each document in the corpus, count how many times each stop word occurs and find out which stop words occur most frequently. Further, calculate the term frequency and inverse document frequency. The motivation behind this is to find out how important a document is to a given query. For example, if the query is "The brown crow", then "the" is less important, while "brown" and "crow" are relatively more important. Since "the" is a common word, its tf will be high; we therefore multiply tf by idf, which accounts for how common the word is across documents and reduces its weight.
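The formulas referred to above are not reproduced in this copy; one standard formulation, which the code below follows, is:

tf(t, d) = (number of times term t occurs in document d) / (total number of words in d)
idf(t) = log(N / df(t)), where N is the number of documents and df(t) is the number of documents containing t
tf-idf(t, d) = tf(t, d) * idf(t)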
import nltk
from nltk.corpus import stopwords
from nltk.corpus import reuters
from nltk.probability import FreqDist
from nltk.text import TextCollection
import math
# 1. Count how many times each stop word occurs in the documents
stop_word_counts = {}

# 3. Calculate IDF (function body reconstructed; assumes the standard log(N / df) formulation)
def compute_idf(documents):
    idf = {}
    num_docs = len(documents)
    for doc in documents:
        for word in set(doc):
            idf[word] = idf.get(word, 0) + 1
    for word in idf:
        idf[word] = math.log(num_docs / idf[word])
    return idf

# 4. Calculate TF-IDF
def compute_tfidf(doc, tf, idf):
    tfidf = {}
    for word in tf:
        tfidf[word] = tf[word] * idf.get(word, 0)  # Use 0 for words not in the IDF dictionary
    return tfidf
Output-
Week-8
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Example texts
texts = [
"I love this product! It's amazing and works perfectly.",
"This is the worst experience I've ever had. So disappointed.",
"I'm not sure how I feel about this. It's okay, I guess.",
"Fantastic! I will definitely buy this again. Highly recommend!",
"The movie was a bit long and boring. Could have been better."
]
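# The scoring step is not shown in this copy; the lines below assume VADER's
# polarity_scores is applied to each example text.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

for text in texts:
    scores = sia.polarity_scores(text)
    print(f"Text: {text}")
    print(f"Scores: {scores}\n")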
Output-
Week-9
# Sample Data - You can replace this with a dataset like the 'SMS Spam Collection Dataset'
# A list of (message, label) tuples
data = [
("Hey, how are you?", "ham"),
("Free cash prize, claim now!", "spam"),
("Call me when you get this message", "ham"),
("Limited time offer, win a lottery!", "spam"),
("Let's meet tomorrow", "ham"),
("Congratulations, you've won a free ticket", "spam"),
("Are we still meeting at 5?", "ham"),
("Earn money from home. Apply now", "spam"),
]
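# The training pipeline is not shown in this copy; the sketch below assumes a
# count vectorizer with a Multinomial Naive Bayes classifier.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

messages, labels = zip(*data)

# Convert messages to bag-of-words features and train the classifier
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(messages)

classifier = MultinomialNB()
classifier.fit(X, labels)

# Classify a new (hypothetical) message
new_message = "You have won a free cash prize, claim now!"
prediction = classifier.predict(vectorizer.transform([new_message]))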
print(f"Message: '{new_message}'")
print(f"Prediction: {prediction[0]}") # spam or ham
Output-
Week-10
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import string
import nltk
# Sample Fake News Dataset (replace this with a real dataset such as "fake_news.csv")
# For this example, the data is structured as 'text' and 'label' columns
data = {
'text': [
"Breaking: Scientists have discovered a new planet.",
"New study finds that vaccines cause autism.",
"Global warming is accelerating at an unprecedented rate.",
"Aliens have made contact with Earth, according to new
reports.",
"The government announces new tax relief measures for small
businesses."
],
'label': ['real', 'fake', 'real', 'fake', 'real']
}
# Create DataFrame
df = pd.DataFrame(data)
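# The preprocessing function is not shown in this copy; a minimal version is assumed
# here (lowercasing plus punctuation removal, using the `string` module imported above).
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text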
# Apply preprocessing
df['text'] = df['text'].apply(preprocess_text)
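# The model pipeline is not shown in this copy; the sketch below assumes TF-IDF features
# with logistic regression, matching the imports above. Labels are mapped to integers so
# that the final check ('fake' if prediction[0] == 1 else 'real') works as written.
df['label'] = df['label'].map({'real': 0, 'fake': 1})

X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

# Classify a new (hypothetical) article
new_article = "Scientists confirm that drinking coffee cures all diseases."
new_article_tfidf = vectorizer.transform([preprocess_text(new_article)])
prediction = model.predict(new_article_tfidf)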
print(f"Article: '{new_article}'")
print(f"Prediction: {'fake' if prediction[0] == 1 else 'real'}")
Output-