C24064 - NLP - Lab Manual
C24064 - NLP - Lab Manual
PRACTICAL 1
TO IMPLEMENT TOKENIZATION OF TEXT.
CODE:
#using nltk
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize
# Fetch the Punkt resources before tokenizing.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource)

# NOTE(review): `text` is not assigned anywhere in this snippet -- presumably
# it is defined in an earlier cell; confirm before running.
sentences = sent_tokenize(text)
print("Sentence Tokenization:", sentences)

words = word_tokenize(text)
print("Word Tokenization:", words)
OUTPUT:
1
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
CODE:
#using spacy
import spacy
# Load the spaCy pipeline; only the token texts of its output are read below.
nlp = spacy.load("en_core_web_sm")

# Sample Hindi passage. The literal used to be broken across two physical
# lines (a SyntaxError for a single-quoted string) and its Devanagari
# characters were scrambled; it is rebuilt here from adjacent string literals.
# NOTE(review): the character repair is a best-effort reconstruction of the
# garbled text -- confirm against the original document.
text = (
    "आपके विचार आपके जीवन का निर्माण करते हैं . "
    "यहाँ संग्रह किये गए महान विचारकों के हजारों "
    "प्रेरक कथन आपके जीवन में एक सकारात्मक बदलाव ला सकते हैं ."
)
doc = nlp(text)
tokens = [token.text for token in doc]
print("Spacy Tokenization:", tokens)  # word tokens
OUTPUT:
CODE:
2
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
import re  # the calls below need it and the snippet never imported it

# NOTE(review): `text` is not assigned in this snippet -- presumably it is the
# passage defined by the preceding spaCy snippet; confirm.

# Python's `re` does not treat combining marks (e.g. Devanagari vowel signs)
# as `\w`, so a bare `\w+` cuts Hindi words apart. The Devanagari block is
# added explicitly, leaving out the danda punctuation marks U+0964/U+0965.
# For text without Devanagari the matches are the same as with `\w+`.
_WORD_PATTERN = r"[\w\u0900-\u0963\u0966-\u097F]+"

tokens = re.findall(_WORD_PATTERN, text)
print("Regular Expression Tokenization:", tokens)

# Split after sentence-ending punctuation that is followed by whitespace.
sentences = re.split(r"(?<=[.!?])\s+", text)
print("regex sentence tokenization:", sentences)
OUTPUT:
3
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
PRACTICAL 2
TO IMPLEMENT STOP WORD REMOVAL.
CODE:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download the stop-word list and the tokenizer resources.
for _resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource)

# NOTE(review): `text` is not assigned in this snippet -- presumably it comes
# from an earlier cell; confirm.
words = word_tokenize(text)
stop_words = set(stopwords.words('english'))

# Keep every token whose lower-cased form is not an English stop word.
filtered_word = [
    token
    for token in words
    if token.lower() not in stop_words
]

print("Original Text:", words)
print("Filtered Text:", ' ', filtered_word)
OUTPUT:
4
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
PRACTICAL 3
TO IMPLEMENT STEMMING OF TEXT.
CODE:
OUTPUT:
5
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
Snowball stemming
CODE:
#Snowball stemmer
import nltk
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
nltk.download('punkt_tab')

text = "Cats are very friendly and loving animals. Dogs are very loyal to humans."
words = word_tokenize(text)

# Stem each token with the English Snowball stemmer.
snowball = SnowballStemmer("english")
snowball_stemmed = list(map(snowball.stem, words))

print("Original words:", words)
print("Snowball Stemmed:", snowball_stemmed)
OUTPUT:
6
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
PRACTICAL 4
TO IMPLEMENT LEMMATIZATION OF TEXT.
CODE:
import nltk
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
# Download the WordNet data, the POS-tagger models and the tokenizer.
for _resource in (
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt_tab',
):
    nltk.download(_resource)

# Single lemmatizer instance used by the rest of the snippet.
lemmatizer = WordNetLemmatizer()
def get_wordnet_pos(word):
    """Map the NLTK POS tag of a single word to a WordNet POS constant.

    The word is tagged on its own (a one-element list is passed to
    ``pos_tag``), so no sentence context is used. The first letter of the
    resulting tag selects the WordNet category.

    Args:
        word: The token to classify.

    Returns:
        ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or ``wordnet.ADV``
        for tags starting with J, N, V or R; ``wordnet.NOUN`` for any other
        tag.
    """
    # pos_tag returns [(word, tag)]; keep only the first letter of the tag.
    tag = pos_tag([word])[0][1][0].upper()
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return tag_dict.get(tag, wordnet.NOUN)
sentence = "The striped bats are hanging on their feet for best"
words = word_tokenize(sentence)

# Lemmatize every token using the WordNet POS looked up for that token.
lemmatized_words_pos = [
    lemmatizer.lemmatize(token, get_wordnet_pos(token))
    for token in words
]
print("Lemmatized words with POS:", lemmatized_words_pos)
7
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
OUTPUT:
8
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
PRACTICAL 5
TO IMPLEMENT N-GRAM MODEL
CODE:
# The snippet used these names without importing them.
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

# NOTE(review): `text` is not assigned in this snippet -- presumably it comes
# from an earlier cell; confirm.

# Tokenize once and reuse the tokens for every n-gram size.
tokens = word_tokenize(text)
unigrams = list(ngrams(tokens, 1))
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))

print("Original text", text)
print("Unigram Text", unigrams)
print("Bigram Text", bigrams)
print("Trigram Text", trigrams)
OUTPUT:
9
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
PRACTICAL 6
TO IMPLEMENT POS TAGGING.
CODE:
import spacy
# Typographic quotes in this snippet were replaced with plain ASCII quotes;
# Python does not accept the curly forms as string delimiters.
nlp = spacy.load("en_core_web_sm")


def pos_tagging_spacy(text):
    """Return ``(token.text, token.pos_)`` pairs for every token in *text*.

    Args:
        text: The string to run through the module-level ``nlp`` pipeline.

    Returns:
        A list of 2-tuples, one per token, in document order.
    """
    doc = nlp(text)
    return [(token.text, token.pos_) for token in doc]


text = "The quick brown fox jumps over the lazy dog"
pos_tags = pos_tagging_spacy(text)
print("POS Tags using spacy: ")
print(pos_tags)
OUTPUT:
10
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
PRACTICAL 7
BUILDING A CUSTOM NER SYSTEM.
CODE:
import spacy
from spacy.training.example import Example
# Start from an empty English pipeline and append an NER component.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner", last=True)

# Register the entity labels used by the training data.
for _label in ("PERSON", "ORG"):
    ner.add_label(_label)
# Training examples as (text, {"entities": [(start, end, label), ...]}).
# Offsets are character positions with an exclusive end, so text[start:end]
# must be exactly the entity string. The "Elon Musk" row previously used
# (0, 10) and (19, 24), which covered "Elon Musk " and "esla." instead.
TRAIN_DATA = [
    (
        "Bill gates founded Microsoft.",
        {"entities": [(0, 10, "PERSON"), (19, 28, "ORG")]},
    ),
    (
        "Elon Musk founded Tesla.",
        {"entities": [(0, 9, "PERSON"), (18, 23, "ORG")]},
    ),
    (
        "Steve jobs created Apple",
        {"entities": [(0, 10, "PERSON"), (19, 24, "ORG")]},
    ),
]
# Initialize the pipeline weights and obtain the optimizer.
# `nlp.initialize()` is the spaCy v3 name for the deprecated
# `nlp.begin_training()`; this file already relies on v3-only APIs
# (`Example`, string component names in `add_pipe`).
optimizer = nlp.initialize()

# Ten passes over the training data, updating on one example at a time.
for _epoch in range(10):
    for text, annotations in TRAIN_DATA:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], sgd=optimizer)
# Print each training text with the BILUO tags derived from its entity
# offsets. The loop body is indented again; it had been flattened.
for text, annotations in TRAIN_DATA:
    doc = nlp.make_doc(text)
    tags = spacy.training.offsets_to_biluo_tags(doc, annotations.get("entities"))
    print(f"Text: {text}")
    print(f"Tags: {tags}")
11
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
OUTPUT:
# Write the pipeline to the "Custom_ner_model" directory.
nlp.to_disk("Custom_ner_model")
print("Training completed and model saved")

import spacy

# Load the saved pipeline back and print the entities found in a sample.
nlp = spacy.load("Custom_ner_model")
text = "Steve Jobs founded Apple"
doc = nlp(text)
for ent in doc.ents:
    print(ent.text, ent.label_)
12
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
PRACTICAL 8
CREATING AND COMPARING DIFFERENT TEXT
REPRESENTATIONS.
BoW Representation
CODE:
#BoW
import nltk
import numpy as np
from collections import Counter
nltk.download('punkt_tab')

texts = [
    "The cat sat on the mat",
    "The dog sat on the log",
]

# Lower-case and tokenize every document.
tokenized_texts = [nltk.word_tokenize(text.lower()) for text in texts]

# Sorted list rather than a bare set: set iteration order for strings can
# differ between interpreter runs, which would shuffle the vector columns.
vocabulary = sorted({word for tokens in tokenized_texts for word in tokens})
vocabulary_size = len(vocabulary)
print(vocabulary)
def get_bow_representation(tokens, vocabulary):
    """Return the bag-of-words count vector of *tokens* over *vocabulary*.

    Args:
        tokens: Sequence of hashable tokens for one document.
        vocabulary: Iterable of words; its iteration order fixes the order
            of the returned counts.

    Returns:
        A list with one integer per vocabulary word: the number of times that
        word occurs in *tokens* (0 when absent).
    """
    # Count once instead of rescanning the token list for every word.
    counts = Counter(tokens)
    return [counts[word] for word in vocabulary]
# One count vector per tokenized document.
bow_vectors = [
    get_bow_representation(tokens, vocabulary)
    for tokens in tokenized_texts
]

print("BoW vectors:")
print(np.array(bow_vectors))
13
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
OUTPUT:
TF-IDF Representation
CODE:
import nltk
import numpy as np
from collections import Counter
from math import log
nltk.download('punkt_tab')

texts = [
    "The cat sat on the mat",
    "The dog sat on the log",
]

# Lower-case and tokenize every document.
tokenized_texts = [nltk.word_tokenize(text.lower()) for text in texts]

# Sorted list rather than a bare set: set iteration order for strings can
# differ between interpreter runs, which would shuffle the vector columns.
vocabulary = sorted({word for tokens in tokenized_texts for word in tokens})
print(vocabulary)
def get_tf(tokens, vocabulary):
    """Return raw term counts of *tokens* over *vocabulary*, printing them.

    Args:
        tokens: Sequence of hashable tokens for one document.
        vocabulary: Iterable of words; its iteration order fixes the order
            of the returned counts.

    Returns:
        A list with the occurrence count of each vocabulary word in *tokens*.

    Side effects:
        Prints a "TF vectors:" heading followed by the vector.
    """
    # Count once instead of rescanning the token list for every word.
    counts = Counter(tokens)
    tf_vector = [counts[word] for word in vocabulary]
    print("\n TF vectors:")
    print(tf_vector)
    return tf_vector
14
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
def get_idf(vocabulary, docs):
    """Return one inverse-document-frequency value per vocabulary word.

    Each value is ``log(N / (1 + df)) + 1`` where ``N`` is ``len(docs)`` and
    ``df`` is the number of documents that contain the word.

    Args:
        vocabulary: Iterable of words; its order fixes the output order.
        docs: Sequence of documents, each a container tested with ``in``.

    Returns:
        A list of floats, one per vocabulary word.

    Raises:
        ValueError: From ``math.log`` when *docs* is empty and *vocabulary*
            is not (the argument of the logarithm is then 0).
    """
    num_docs = len(docs)
    idf_vector = []
    for word in vocabulary:
        num_docs_with_word = sum(1 for doc in docs if word in doc)
        # The 1 in the denominator avoids division by zero for unseen words.
        idf_value = log(num_docs / (1 + num_docs_with_word)) + 1
        idf_vector.append(idf_value)
    return idf_vector
def get_tfidf(tokens, vocabulary, idf_vector):
    """Return the TF-IDF vector of one document.

    Args:
        tokens: Tokens of the document, passed on to ``get_tf``.
        vocabulary: Words defining the vector positions.
        idf_vector: IDF values, expected in the same order as *vocabulary*.

    Returns:
        A list holding the element-wise product of the term counts and
        *idf_vector* (``zip`` stops at the shorter of the two).
    """
    tf_vector = get_tf(tokens, vocabulary)
    tfidf_vector = [tf * idf for tf, idf in zip(tf_vector, idf_vector)]
    return tfidf_vector
OUTPUT:
15
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
CODE:
import nltk
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt_tab')

texts = [
    "The cat sat on the mat.",
    "The mat is on the table.",
]

# Lower-case and tokenize every document.
tokenized_texts = [nltk.word_tokenize(text.lower()) for text in texts]

# Sorted list rather than a bare set: set iteration order for strings can
# differ between interpreter runs, which would shuffle the vector columns.
vocabulary = sorted({word for tokens in tokenized_texts for word in tokens})
print(vocabulary)
16
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
OUTPUT:
IDF-VECTOR
CODE:
import nltk
import numpy as np
from collections import Counter
from math import log
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('punkt_tab')

texts = [
    "The cat sat on the mat.",
    "The mat is on the table.",
]

# Lower-case and tokenize every document.
tokenized_texts = [nltk.word_tokenize(text.lower()) for text in texts]

# Sorted list rather than a bare set: set iteration order for strings can
# differ between interpreter runs, which would shuffle the vector columns.
vocabulary = sorted({word for tokens in tokenized_texts for word in tokens})
print(vocabulary)
def get_bow_representation(tokens, vocabulary):
    """Return the bag-of-words count vector of *tokens* over *vocabulary*.

    Args:
        tokens: Sequence of hashable tokens for one document.
        vocabulary: Iterable of words; its iteration order fixes the order
            of the returned counts.

    Returns:
        A list with one integer per vocabulary word: the number of times that
        word occurs in *tokens* (0 when absent).
    """
    # Count once instead of rescanning the token list for every word.
    counts = Counter(tokens)
    return [counts[word] for word in vocabulary]
17
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
def get_tf(tokens, vocabulary):
    """Return raw term counts of *tokens* over *vocabulary*.

    Args:
        tokens: Sequence of hashable tokens for one document.
        vocabulary: Iterable of words; its iteration order fixes the order
            of the returned counts.

    Returns:
        A list with the occurrence count of each vocabulary word in *tokens*.
    """
    # Count once instead of rescanning the token list for every word.
    counts = Counter(tokens)
    return [counts[word] for word in vocabulary]
def get_idf(vocabulary, docs):
    """Return one inverse-document-frequency value per vocabulary word.

    Each value is ``log(N / (1 + df)) + 1`` where ``N`` is ``len(docs)`` and
    ``df`` is the number of documents that contain the word.

    Args:
        vocabulary: Iterable of words; its order fixes the output order.
        docs: Sequence of documents, each a container tested with ``in``.

    Returns:
        A list of floats, one per vocabulary word.

    Raises:
        ValueError: From ``math.log`` when *docs* is empty and *vocabulary*
            is not (the argument of the logarithm is then 0).
    """
    # The numerator is the total document count. It used to be the word's own
    # document frequency, which made the value independent of corpus size and
    # evaluated log(0) for any word absent from every document.
    num_docs = len(docs)
    idf_vector = []
    for word in vocabulary:
        num_docs_with_word = sum(1 for doc in docs if word in doc)
        idf_value = log(num_docs / (1 + num_docs_with_word)) + 1
        idf_vector.append(idf_value)
    return idf_vector
def get_tfidf(tokens, vocabulary, idf_vector):
    """Return the TF-IDF vector of one document.

    Args:
        tokens: Tokens of the document, passed on to ``get_tf``.
        vocabulary: Words defining the vector positions.
        idf_vector: IDF values, expected in the same order as *vocabulary*.

    Returns:
        A list holding the element-wise product of the term counts and
        *idf_vector* (``zip`` stops at the shorter of the two).
    """
    tf_vector = get_tf(tokens, vocabulary)
    tfidf_vector = [tf * idf for tf, idf in zip(tf_vector, idf_vector)]
    return tfidf_vector
idf_vector = get_idf(vocabulary, tokenized_texts)
print("\n IDF vector")
print(idf_vector)

# `bow_vectors` was read below without ever being assigned in this snippet;
# build it from the get_bow_representation helper defined above.
bow_vectors = [
    get_bow_representation(tokens, vocabulary) for tokens in tokenized_texts
]
tfidf_vectors = [
    get_tfidf(tokens, vocabulary, idf_vector) for tokens in tokenized_texts
]

# BoW of document 1 against TF-IDF of document 2.
bow_similarity = cosine_similarity([bow_vectors[0]], [tfidf_vectors[1]])[0][0]
print("Cosine similarity between doc1(Bow) and doc2(TF-IDF):")
print(bow_similarity)

# BoW of document 2 against TF-IDF of document 1. The label previously
# repeated the first one although the documents are swapped here.
bow_similarity = cosine_similarity([bow_vectors[1]], [tfidf_vectors[0]])[0][0]
print("Cosine similarity between doc2(Bow) and doc1(TF-IDF):")
print(bow_similarity)
18
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
OUTPUT:
19
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
PRACTICAL 9
TRAINING AND USING WORD EMBEDDINGS
CODE:
20
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
# Example usage: four short English sentences used as the sample corpus.
# NOTE(review): the code that consumes `sentences` is not visible in this
# section -- confirm how it is tokenized before training.
sentences = [
"The quick brown fox jumps over the lazy dog",
"A fox is a cunning animal",
"The dog barks at night",
"Foxes and dogs are different species"
]
OUTPUT:
21
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
PRACTICAL 10
IMPLEMENTING A TEXT CLASSIFIER.
CODE:
def train_test_classifier(X, y):
    """Train a bag-of-words Naive Bayes classifier and print its report.

    The data is split 80/20 with a fixed random state, the vectorizer is
    fitted on the training part only, and a classification report for the
    held-out part is printed.

    Args:
        X: Sequence of raw text documents.
        y: Sequence of labels aligned with *X*.

    Returns:
        A ``(vectorizer, classifier)`` tuple: the fitted ``CountVectorizer``
        and the fitted ``MultinomialNB``.
    """
    # Imported locally because no import for these names is visible alongside
    # this function.
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split
    from sklearn.naive_bayes import MultinomialNB

    # The assignment was previously split after "=", which is a syntax error.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    vectorizer = CountVectorizer()
    X_train_vectorized = vectorizer.fit_transform(X_train)
    # Reuse the training vocabulary for the test documents.
    X_test_vectorized = vectorizer.transform(X_test)

    classifier = MultinomialNB()
    classifier.fit(X_train_vectorized, y_train)

    y_pred = classifier.predict(X_test_vectorized)
    print(classification_report(y_test, y_pred))
    return vectorizer, classifier
def classify_text(text, vectorizer, classifier):
    """Predict the label of a single text.

    Args:
        text: The document to classify.
        vectorizer: Object whose ``transform`` accepts a list of documents.
        classifier: Object whose ``predict`` accepts the transformed data.

    Returns:
        The first (and only) element of the classifier's prediction.
    """
    # transform expects a collection of documents, so wrap the single text.
    text_vectorized = vectorizer.transform([text])
    prediction = classifier.predict(text_vectorized)
    return prediction[0]
X=[
22
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
OUTPUT:
23
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
PRACTICAL 11
BUILDING A SENTIMENT ANALYSIS SYSTEM.
CODE:
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
nltk.download('vader_lexicon')
# Lazily created analyzer shared by calls that do not pass their own, so the
# analyzer is not rebuilt for every text.
_DEFAULT_ANALYZER = None


def analyze_sentiment(text, analyzer=None):
    """Classify *text* as Positive, Negative or Neutral.

    Args:
        text: The string to score.
        analyzer: Optional object with a ``polarity_scores(text)`` method
            returning a dict that has a ``'compound'`` key. When omitted, a
            shared ``SentimentIntensityAnalyzer`` is created on first use.

    Returns:
        A ``(sentiment, sentiment_scores)`` tuple: the label and the score
        dict returned by the analyzer. The label is "Positive" for a compound
        score >= 0.1, "Negative" for <= -0.1 and "Neutral" otherwise.
    """
    global _DEFAULT_ANALYZER
    if analyzer is None:
        if _DEFAULT_ANALYZER is None:
            _DEFAULT_ANALYZER = SentimentIntensityAnalyzer()
        analyzer = _DEFAULT_ANALYZER

    sentiment_scores = analyzer.polarity_scores(text)
    compound = sentiment_scores['compound']
    if compound >= 0.1:
        sentiment = "Positive"
    elif compound <= -0.1:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    return sentiment, sentiment_scores
def analyze_sentiments(texts):
    """Score every text and collect the results in a DataFrame.

    Args:
        texts: Iterable of strings, each passed to ``analyze_sentiment``.

    Returns:
        A ``pandas.DataFrame`` with one row per text and the columns
        ``text``, ``sentiment``, ``pos_score``, ``neg_score``, ``neu_score``
        and ``compound_score``.
    """
    # Page-header text had ended up inside the dict literal below; it is
    # removed so the literal parses again.
    results = []
    for text in texts:
        sentiment, scores = analyze_sentiment(text)
        results.append({
            "text": text,
            "sentiment": sentiment,
            "pos_score": scores['pos'],
            "neg_score": scores['neg'],
            "neu_score": scores['neu'],
            "compound_score": scores['compound'],
        })
    return pd.DataFrame(results)
# Sample sentences to run through the sentiment pipeline.
texts = [
    "My mother cooks a very delicious pizza.",
    "My father is not going to his office.",
    "I love the gaming laptop my brother bought.",
    "This movie is amazing.",
    "She hates playing tennis with her classmate.",
    "Today is a beautiful day.",
    "This movie is horrible.",
]

# Score all sentences and show the resulting table.
results_df = analyze_sentiments(texts)
print(results_df)
25
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
OUTPUT:
26
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
PRACTICAL 12
CREATING A TEXT SUMMARIZATION TOOL
CODE:
# NOTE(review): `summarize_text` and `long_text` are not defined in this
# snippet -- presumably they come from code not shown here; confirm.
summary = summarize_text(long_text)

# Report the character counts before and after summarization.
print("Original text length:", len(long_text))
print("Summary length:", len(summary))

# Heading and summary on separate lines, preceded by a blank line.
print("\nSummary:", summary, sep="\n")
27
DES’S NMITD
Roll No.: C24064 NATURAL LANGUAGE PROCESSING MCA SEM II
LAB MANUAL
OUTPUT:
Summary:
Climate change is one of the most pressing issues facing our planet today. It refers to
long-term shifts in temperatures and weather patterns, mainly caused by human activities. These
activities release greenhouse gases into the atmosphere, trapping heat and causing the Earth's
average temperature to rise.
28