NLP Projects


Text Concordance

import nltk
from nltk.corpus import gutenberg
from nltk.text import Text

corpus = gutenberg.words("shakespeare-macbeth.txt")
text = Text(corpus)
text.concordance("monstrous")

Output

Displaying 1 of 1 matches:
Who cannot want the thought , how monstrous It was for Malcolme , and for Dona.
Vocabulary Count

import nltk

text = "welcome to the world"
words = nltk.word_tokenize(text)
num_words = len(words)
num_the = words.count('the')
unique_words = set(words)
num_unique_words = len(unique_words)
percent_unique = (num_unique_words / num_words) * 100

print(words)
print("the number of words:", num_words)
print('number of occurrences of "the":', num_the)
print("number of unique words:", num_unique_words)
print("percentage of unique words:", percent_unique)

Output

['welcome', 'to', 'the', 'world']
the number of words: 4
number of occurrences of "the": 1
number of unique words: 4
percentage of unique words: 100.0
Text Preprocessing

import nltk
nltk.download('stopwords')
nltk.download('words')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.corpus import words

text = 'This is a sample text that we used to demonstrate NLTK text processing 123'
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

alpha_tokens = [token.lower() for token in tokens if token.isalpha()]
english_words = set(words.words())
valid_tokens = [token for token in alpha_tokens if token in english_words]
filtered_tokens = [token for token in valid_tokens if token not in stop_words]
stemmer_tokens = [stemmer.stem(token) for token in filtered_tokens]

print("Original text :", text)
print("Tokenized text :", tokens)
print("Filtered text :", filtered_tokens)
print("Validated text :", valid_tokens)
print("Alpha text :", alpha_tokens)
print("Stemmed text :", stemmer_tokens)

Output

Original text : This is a sample text that we used to demonstrate NLTK text processing 123
Tokenized text : ['This', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'NLTK', 'text', 'processing', '123']
Filtered text : ['sample', 'text', 'used', 'demonstrate', 'text']
Validated text : ['this', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'text']
Alpha text : ['this', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'nltk', 'text', 'processing']
Bag of Words

from sklearn.feature_extraction.text import CountVectorizer

corpus = ["This is the first document",
          "This document is the second document",
          "And this is the third one",
          "Is this the first document"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

for i in range(len(corpus)):
    print(f"BoW representation of Document {i+1}: {X[i].toarray()[0]}")

Output

BoW representation of Document 1: [0 1 1 1 0 0 1 0 1]
BoW representation of Document 2: [0 2 0 1 0 1 1 0 1]
BoW representation of Document 3: [1 0 0 1 1 0 1 1 1]
BoW representation of Document 4: [0 1 1 1 0 0 1 0 1]
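Each position in a BoW vector corresponds to one word of the learned vocabulary, sorted alphabetically. A quick way to see that mapping (a sketch; get_feature_names_out assumes scikit-learn 1.0 or newer, older releases expose get_feature_names instead):

print(vectorizer.get_feature_names_out())
# ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
# so the 2 in Document 2's vector is the double occurrence of "document"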
TF-IDF

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import math

def calculate_tf(word, document):
    word_frequency = document.count(word)
    return word_frequency / len(document)

def calculate_idf(word, corpus):
    num_documents_containing_word = len([True for document in corpus if word in document])
    if num_documents_containing_word == 0:
        return 0
    else:
        return math.log10(len(corpus) / num_documents_containing_word)

def calculate_tfidf(document, corpus):
    PS = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    words = [PS.stem(word.lower()) for word in word_tokenize(document) if word.lower() not in stop_words]
    word_tfidf_values = {}
    for word in words:
        if word not in word_tfidf_values:
            tf = calculate_tf(word, words)
            idf = calculate_idf(word, corpus)
            word_tfidf_values[word] = tf * idf
    return word_tfidf_values

corpus = ["This is the first document", "This document is the second document",
          "And this is the third one", "Is this the first document"]
document = "This is the second document"
tfidf_vector = calculate_tfidf(document, corpus)
print(tfidf_vector)

Output

{'second': 0.3010299956639812, 'document': 0.06246936830414996}
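As a sanity check on that output: after stopword removal and stemming, the query document reduces to ['second', 'document'], so each term has TF = 1/2. 'second' occurs in 1 of the 4 corpus strings, giving IDF = log10(4/1) ≈ 0.602 and TF-IDF ≈ 0.301; 'document' occurs in 3 of 4, giving IDF = log10(4/3) ≈ 0.125 and TF-IDF ≈ 0.062, matching the printed values. (scikit-learn's TfidfVectorizer uses a smoothed natural-log IDF and L2 normalisation, so it would not reproduce these exact numbers.)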

Pos Tagging

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

line = "quick brown fox jumps over the lazy dog"
tokens = nltk.word_tokenize(line)
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
pos_tags = nltk.pos_tag(lemmatized_tokens)
pos_word_corpus = [(word, tag) for word, tag in pos_tags]

for word, tag in pos_word_corpus:
    print(word, ":", tag)

Output

quick : JJ
brown : NN
fox : JJ
jump : NN
lazy : NN
dog : NN
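The tags in the output are Penn Treebank abbreviations (JJ = adjective, NN = singular noun). NLTK can print the definition of any tag, assuming the 'tagsets' resource has been downloaded:

nltk.download('tagsets')
nltk.help.upenn_tagset('JJ')   # prints the description and example words for the JJ tag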
Named Entity Recognition

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

text = "Josh works for Twitter in California."
tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens)
entities = nltk.chunk.ne_chunk(tagged)
for entity in entities:
    if hasattr(entity, 'label'):
        # join the words of multi-word entities with spaces
        print(entity.label(), ' '.join(c[0] for c in entity.leaves()))

Output

PERSON Josh
GPE Twitter
GPE California
Pos Tagging via HMM

import nltk
nltk.download('brown')
from nltk.corpus import brown

def train_hmm_tagger():
    tagged_sentence = brown.tagged_sents(categories='news')
    size = int(len(tagged_sentence) * 0.9)
    trained_sents = tagged_sentence[:size]
    test_sents = tagged_sentence[size:]
    symbols = set([word for sentence in tagged_sentence for word, _ in sentence])
    states = set([tag for sentence in tagged_sentence for _, tag in sentence])
    trainer = nltk.tag.hmm.HiddenMarkovModelTrainer(states=states, symbols=symbols)
    hmm_tagger = trainer.train_supervised(trained_sents)
    return hmm_tagger

def pos_tag_sentence(sentence, hmm_tagger):
    tokens = nltk.word_tokenize(sentence)
    tagged_tokens = hmm_tagger.tag(tokens)
    return tagged_tokens

hmm_tagger = train_hmm_tagger()
sentence = input("Enter the sentence to be tagged?")
tagged = pos_tag_sentence(sentence, hmm_tagger)
print(tagged)

Output

Enter the sentence to be tagged? The sky is so beautiful.
[('The', 'AT'), ('sky', 'NN'), ('is', 'BEZ'), ('so', 'QL'), ('beautiful', 'JJ')]
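train_hmm_tagger builds test_sents but never uses it; returning that held-out slice as well would let the tagger be scored. A minimal sketch, assuming NLTK 3.6+ where taggers expose accuracy() (earlier releases call the same method evaluate()):

# change the last line of train_hmm_tagger to: return hmm_tagger, test_sents
hmm_tagger, test_sents = train_hmm_tagger()
print("Accuracy on held-out news sentences:", hmm_tagger.accuracy(test_sents))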
Chatbot

import nltk
from nltk.chat.util import Chat, reflections

pairs = [[r"Hello|hi|hey|hola",
          ["Hello, I am Aura, your AI assistant. How may I help you?"]],
         [r"How are you|How are you doing",
          ["I'm good, how about you?"]],
         [r"What song always gets you in a good mood?",
          ['"Happy" by Pharrell Williams never fails to put a smile on my face.']],
         [r"Suggest a trending song",
          ['Good 4 U by Olivia Rodrigo',
           'Montero (Call Me By Your Name) by Lil Nas X',
           'Save Your Tears by The Weeknd',
           'Levitating by Dua Lipa']],
         [r"quit", ["Good bye"]],
         [r"(.*)", ["Could you try again?"]]]

bot = Chat(pairs, reflections)
bot.converse()

Output

>hi
Hello, I am Aura, your AI assistant. How may I help you?
>how are you
I'm good, how about you?
>What song always gets you in a good mood?
"Happy" by Pharrell Williams never fails to put a smile on my face.
>Suggest a trending song
Save Your Tears by The Weeknd
>bye
Good bye
TEXT CLASSIFICATION USING LOGISTIC REGRESSION

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

def preprocess(text):
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    words = [word_tokenize(sentence) for sentence in text]
    filtered_words = [[ps.stem(word) for word in tokenized
                       if word not in stop_words and word.isalpha()]
                      for tokenized in words]
    filtered_sentences = [' '.join(sentence) for sentence in filtered_words]
    return filtered_sentences

sentences = ["The food is tasty", "the quality of food is low",
             "i will never recommend their food", "I got sick after having their food",
             "I was in cloudnine after tasting their food",
             "My favourite is their desserts", "the food was not cooked properly"]
classes = [1, 0, 0, 0, 1, 1, 0]
test_sentences = ["food is not cooked properly", "I feel sick after having food",
                  "I love their desserts", "was in cloudnine after tasting their food"]

vectorizer = CountVectorizer()
sentences = preprocess(sentences)
vect1 = vectorizer.fit_transform(sentences)
# Optional split for held-out testing:
# train_data, test_data, train_labels, test_labels = train_test_split(vect1, classes, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(vect1, classes)
test_sentences = preprocess(test_sentences)
vect2 = vectorizer.transform(test_sentences)
pred_classes = model.predict(vect2)
print(pred_classes)

Output

[0 0 1 1]
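To relate the printed labels back to the reviews, the predictions can be zipped with the raw test sentences. A small sketch (the listing overwrites test_sentences with its preprocessed form, so the raw strings are kept in a separate variable here):

raw_test_sentences = ["food is not cooked properly", "I feel sick after having food",
                      "I love their desserts", "was in cloudnine after tasting their food"]
for sentence, label in zip(raw_test_sentences, pred_classes):
    # the training data used 1 for positive reviews and 0 for negative ones
    print(f"{sentence} -> {label}")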
TEXT CLASSIFICATION USING NAÏVE BAYES

import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

nltk.download('movie_reviews')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens
              if token not in stop_words and token.isalpha()]
    return dict(nltk.FreqDist(tokens))

pos_reviews = [(movie_reviews.raw(fileid), 'positive') for fileid in movie_reviews.fileids('pos')]
neg_reviews = [(movie_reviews.raw(fileid), 'negative') for fileid in movie_reviews.fileids('neg')]
tot_rev = pos_reviews + neg_reviews
processed_data = [(preprocess(text), category) for (text, category) in tot_rev]
train_data, val_data = train_test_split(processed_data, test_size=0.2, random_state=42)

classifier = NaiveBayesClassifier.train(train_data)

new_text = ["The movie was amazing", "the movie was terrible", "The movie was awful"]
for text in new_text:
    new_features = preprocess(text)
    predicted_category = classifier.classify(new_features)
    print(f"The predicted category for '{text}' is '{predicted_category}'")

Output

The predicted category for 'The movie was amazing' is 'positive'
The predicted category for 'the movie was terrible' is 'negative'
The predicted category for 'The movie was awful' is 'negative'
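The split above sets aside val_data but never scores the model on it. A short sketch using the standard nltk.classify utilities to report validation accuracy and the strongest features:

from nltk.classify import accuracy
print("Validation accuracy:", accuracy(classifier, val_data))   # fraction of val_data labelled correctly
classifier.show_most_informative_features(10)                   # words with the largest likelihood ratios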
