NLP Lab
import re

# Remove all digit runs from the text
data = "The biggest 5 Animals are 1.Elephant 2.Giraffe 3.Tiger 4.Lion 5.Cheetah"
result = re.sub(r"\d+", '', data)
print(result)
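Stripping the digits leaves orphaned list periods and doubled spaces; a small follow-up pass can tidy the result (a minimal sketch, not part of the original lab code):
# Drop the leftover periods and collapse repeated whitespace
cleaned = re.sub(r"\s+", " ", result.replace(".", " ")).strip()
print(cleaned)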
def punctuations(data):
    # Expand common contractions
    text = data
    text = text.replace("n't", " not")
    text = text.replace("'s", " is")
    text = text.replace("'re", " are")
    text = text.replace("'ll", " will")
    return text

s = "How's my team doin, you're supposed to be not losing"
returned_data = punctuations(s)
print(returned_data)
Tokenization, Stemming
import nltk
nltk.download('punkt')

data = "Welcome to TIMSCDR!!"
# Sentence tokenization
tokens = nltk.sent_tokenize(data)
print(tokens)
# Word tokenization
tokens = nltk.word_tokenize(data)
print(tokens)
from nltk.stem import PorterStemmer

port_stemmer = PorterStemmer()
print(port_stemmer.stem("Liked"))
data = ["liked", "liking", "likes", "killing", "killed"]
for words in data:
    print(words, " :", port_stemmer.stem(words))
print("Socks :",lemmati.lemmatize("socks"))
print("corpora :",lemmati.lemmatize("corpora"))
print("better :",lemmati.lemmatize("better",pos="a"))
Removal of Stopwords
data="All work and no play.All work and no play makes jack a dull boy"
tokens=nltk.word_tokenize(data)
filtered_Data=[]
for w in tokens:
if w not in stopWords:
filtered_Data.append(w)
print(filtered_Data)
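The same filter is often written as a list comprehension; a one-line equivalent of the loop above:
# Equivalent one-liner
filtered_Data = [w for w in tokens if w not in stopWords]
print(filtered_Data)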
Implementation of POS Tag
1.
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is my School")  # sample text (assumed; the original omits it)
# Count tokens per part-of-speech tag
POS_count = doc.count_by(spacy.attrs.POS)
print(POS_count)
2.
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is my School")
options = {'color': 'red', 'bg': 'blue', 'compact': True, 'distance': 100}
displacy.render(doc, style="dep", options=options)
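displacy.render draws inline inside a Jupyter notebook; when running as a plain script, spaCy's displacy.serve can host the same visualization locally instead (a minimal sketch):
# Serve the dependency parse in the browser (default port 5000)
displacy.serve(doc, style="dep", options=options)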
3.
import nltk
from nltk.tag import DefaultTagger

text = "This is my School"  # sample text (assumed; the original omits it)
tokens = nltk.word_tokenize(text)
# DefaultTagger assigns the same tag ("Ad" here) to every token
tagging = DefaultTagger("Ad")
print(tagging.tag(tokens))
4.
import nltk

sentence = "the little yellow dog barked at the car"
# Tokenization
tokens = nltk.word_tokenize(sentence)
# POS tagging
tag = nltk.pos_tag(tokens)
# Chunk grammar (assumed; the original omits it): an NP is an optional
# determiner, any number of adjectives, then a noun
phrase = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(phrase)
result = cp.parse(tag)
print(result)
TF-IDF implementation
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

text = ["I love writing code in Python. I love Python code",
        "I hate writing code in Java. I hate Java code"]
df = pd.DataFrame({'review': ['review1', 'review2'], 'text': text})
cv = CountVectorizer(stop_words='english')
cv_matrix = cv.fit_transform(df['text'])
# Document-term matrix of raw counts
df_dtm = pd.DataFrame(cv_matrix.toarray(), index=df['review'].values,
                      columns=cv.get_feature_names_out())
df_dtm
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

text = ["I love writing code in Python. I love Python code",
        "I hate writing code in Java. I hate Java code"]
df = pd.DataFrame({'review': ['review1', 'review2'], 'text': text})
tfidf = TfidfVectorizer(stop_words='english', norm=None)
tfidf_matrix = tfidf.fit_transform(df['text'])
output = pd.DataFrame(tfidf_matrix.toarray(), index=df['review'],
                      columns=tfidf.get_feature_names_out())
output
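The score in the "python" column of review1 can be checked by hand; a minimal sketch, assuming scikit-learn's defaults (smooth_idf=True, raw term counts) together with norm=None as above, where idf(t) = ln((1 + n) / (1 + df(t))) + 1:
import numpy as np

n = 2            # number of documents
df_python = 1    # "python" appears in one document
tf_python = 2    # "python" occurs twice in review1
idf = np.log((1 + n) / (1 + df_python)) + 1
print(tf_python * idf)   # ~2.8109, matching the "python" column of review1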
Practical 4: Creating and comparing different text representations
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compare_text_representations(texts):
    # Bag of Words (BoW) representation
    bow_vectorizer = CountVectorizer()
    bow_matrix = bow_vectorizer.fit_transform(texts)
    # TF-IDF representation
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
    # Compare similarities between the two documents
    bow_similarity = cosine_similarity(bow_matrix[0], bow_matrix[1])
    tfidf_similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    print("BoW vectors:")
    print(bow_matrix.toarray())
    print("\nTF-IDF vectors:")
    print(tfidf_matrix.toarray())
    print(f"\nBoW Cosine Similarity: {bow_similarity[0][0]:.4f}")
    print(f"TF-IDF Cosine Similarity: {tfidf_similarity[0][0]:.4f}")

# Example usage
texts = [
    "The cat sat on the mat",
    "The dog sat on the log"
]
compare_text_representations(texts)
Training and using word embeddings
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

def train_word_embeddings(sentences):
    # Tokenize and lowercase each sentence
    tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]
    # Train a small Word2Vec model (hyperparameters assumed for this toy corpus)
    model = Word2Vec(tokenized_sentences, vector_size=100, window=5,
                     min_count=1, workers=1)
    return model

def use_word_embeddings(model, word):
    # Show the nearest neighbours of a word in the embedding space
    print(f"Words most similar to '{word}':")
    for similar_word, score in model.wv.most_similar(word, topn=3):
        print(f"  {similar_word}: {score:.4f}")

# Example usage
sentences = [
    "The quick brown fox jumps over the lazy dog",
    "A fox is a cunning animal",
    "The dog barks at night",
    "Foxes and dogs are different species"
]
model = train_word_embeddings(sentences)
use_word_embeddings(model, "fox")
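Beyond nearest neighbours, the trained model also exposes raw vectors and pairwise similarities (gensim 4.x API; a brief sketch):
# Inspect a learned vector and a pairwise similarity
print(model.wv["fox"][:5])                # first 5 dimensions of the "fox" vector
print(model.wv.similarity("fox", "dog"))  # cosine similarity of the two words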
Implement N gram Language model
1.
import nltk
from nltk.util import ngrams

nltk.download('punkt')
text = "The cat sat on the mat"  # sample text (assumed; the original omits it)
Ngram = ngrams(sequence=nltk.word_tokenize(text), n=3)
print(list(Ngram))
2.
import random
from collections import defaultdict
from nltk.util import bigrams, trigrams

def build_language_model(text, n=2):
    words = nltk.word_tokenize(text.lower())
    if n == 2:
        pairs = list(bigrams(words))
    elif n == 3:
        pairs = list(trigrams(words))
    else:
        raise ValueError("n must be 2 or 3")
    # Simple model: map each (n-1)-word context to the words seen after it
    model = defaultdict(list)
    for pair in pairs:
        model[pair[:-1]].append(pair[-1])
    return model

def generate_text(model, seed, length=10):
    # Extend the seed by sampling observed continuations
    words = list(seed)
    for _ in range(length):
        context = tuple(words[-len(seed):])
        candidates = model.get(context)
        if not candidates:
            break
        next_word = random.choice(candidates)
        words.append(next_word)
    return " ".join(words)

# Example usage
text = """
The cat sat on the mat. The dog ran in the park.
Cats like to play with toys. Dogs enjoy chasing balls.
"""
bigram_model = build_language_model(text, n=2)
trigram_model = build_language_model(text, n=3)
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score

# df = pd.read_csv(...)  # load the sentiment dataset (source not shown in the handout)
df.head()
# Rename columns
df.columns = ['Sentiment', 'Text']
# Encode labels as integers
le = LabelEncoder()
df['Sentiment'] = le.fit_transform(df['Sentiment'])
df.head()
X = list(df['Text'])
y = list(df['Sentiment'])
# Train/test split and classifier (assumed; the handout omits their definitions)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
cv = CountVectorizer(stop_words='english')
X_train_cv = cv.fit_transform(X_train)
X_test_cv = cv.transform(X_test)
clf = MultinomialNB()
clf.fit(X_train_cv, y_train)
y_pred = clf.predict(X_test_cv)
score = f1_score(y_test, y_pred, average="micro")
print('F-1 score : {}'.format(np.round(score, 4)))
# Effect of widening the n-gram range on classification quality
for i in range(1, 11):
    cv = CountVectorizer(analyzer='word', ngram_range=(1, i), stop_words='english')
    # Creating BoW features
    X_train_cv = cv.fit_transform(X_train)
    X_test_cv = cv.transform(X_test)
    # Training the classifier
    clf2 = MultinomialNB()
    clf2.fit(X_train_cv, y_train)
    y_pred = clf2.predict(X_test_cv)
    score = f1_score(y_test, y_pred, average="micro")
    print('F1-Score: {}'.format(np.round(score, 4)))
Implementing a text classifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Create a CountVectorizer (X_train/X_test/y_train/y_test are assumed to come
# from an earlier train_test_split)
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)
# Train a Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)
# Make predictions
y_pred = classifier.predict(X_test_vectorized)
print(classification_report(y_test, y_pred))

def predict_text(text):  # helper name assumed; the handout shows only the body
    # Classify a single new text
    text_vectorized = vectorizer.transform([text])
    prediction = classifier.predict(text_vectorized)
    return prediction[0]
# Example usage (the handout's sample data list "X = [" is truncated)
print(predict_text("I really enjoyed this"))  # hypothetical input
import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')

def analyze_sentiment(text):
    sia = SentimentIntensityAnalyzer()
    sentiment_scores = sia.polarity_scores(text)
    # Standard VADER thresholds on the compound score
    if sentiment_scores['compound'] >= 0.05:
        sentiment = "Positive"
    elif sentiment_scores['compound'] <= -0.05:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    return sentiment, sentiment_scores
def analyze_sentiments(texts):
    results = []
    for text in texts:
        sentiment, scores = analyze_sentiment(text)
        results.append({
            'text': text,
            'sentiment': sentiment,
            'pos_score': scores['pos'],
            'neg_score': scores['neg'],
            'neu_score': scores['neu'],
            'compound_score': scores['compound']
        })
    return pd.DataFrame(results)
# Example usage
texts = [  # placeholder examples; the handout's list is truncated
    "I love this product, it works great!",
    "This is the worst service I have ever used.",
    "The package arrived on Tuesday.",
]
results_df = analyze_sentiments(texts)
print(results_df)
Text Summarization
from transformers import pipeline

def summarize_text(text):
    # Hugging Face summarization pipeline (default model; length limits assumed)
    summarizer = pipeline("summarization")
    summary = summarizer(text, max_length=60, min_length=20, do_sample=False)
    return summary[0]['summary_text']
# Example usage
long_text = """
Climate change is one of the most pressing issues facing our planet today. It refers
to long-term shifts in temperatures and weather patterns, mainly caused by human
activities, especially the burning of fossil fuels. These activities release greenhouse
gases into the atmosphere, trapping heat and causing the Earth's average
temperature to rise. The consequences of climate change are far-reaching and
include more frequent and severe weather events, rising sea levels, and disruptions
to ecosystems. To address this global challenge, countries and organizations
worldwide are working on strategies to reduce greenhouse gas emissions and
transition to cleaner energy sources. Individual actions, such as reducing energy
consumption and adopting sustainable practices, also play a crucial role in
mitigating the effects of climate change.
"""
summary = summarize_text(long_text)
print("\nSummary:")
print(summary)