NLP Projects


Text Concordance

import nltk
from nltk.corpus import gutenberg
from nltk.text import Text

corpus = gutenberg.words("shakespeare-macbeth.txt")
text = Text(corpus)
text.concordance("monstrous")

Output

Displaying 1 of 1 matches:
Who cannot want the thought , how monstrous It was for Malcolme , and for Dona.
Vocabulary Count

import nltk

text = "welcome to the world"
words = nltk.word_tokenize(text)
num_words = len(words)
num_the = words.count('the')
unique_words = set(words)
num_unique_words = len(unique_words)
percent_unique = (num_unique_words / num_words) * 100

print(words)
print("the number of words:", num_words)
print('number of occurrences of "the":', num_the)
print("number of unique words:", num_unique_words)
print("percentage of unique words:", percent_unique)

Output

['welcome', 'to', 'the', 'world']
the number of words: 4
number of occurrences of "the": 1
number of unique words: 4
percentage of unique words: 100.0
Text Preprocessing

import nltk
nltk.download('stopwords')
nltk.download('words')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.corpus import words

text = 'This is a sample text that we used to demonstrate NLTK text processing 123'
tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

alpha_tokens = [token.lower() for token in tokens if token.isalpha()]
english_words = set(words.words())
valid_tokens = [token for token in alpha_tokens if token in english_words]
filtered_tokens = [token for token in valid_tokens if token not in stop_words]
stemmer_tokens = [stemmer.stem(token) for token in filtered_tokens]

print("Original text :", text)
print("Tokenized text :", tokens)
print("Filtered text :", filtered_tokens)
print("Validated text :", valid_tokens)
print("Alpha text :", alpha_tokens)
print("Stemmed text :", stemmer_tokens)

Output

Original text : This is a sample text that we used to demonstrate NLTK text processing 123
Tokenized text : ['This', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'NLTK', 'text', 'processing', '123']
Filtered text : ['sample', 'text', 'used', 'demonstrate', 'text']
Validated text : ['this', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'text']
Alpha text : ['this', 'is', 'a', 'sample', 'text', 'that', 'we', 'used', 'to', 'demonstrate', 'nltk', 'text', 'processing']
Bag of Words

from sklearn.feature_extraction.text import CountVectorizer

corpus = ["This is the first document",
          "This document is the second document",
          "And this is the third one",
          "Is this the first document"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

for i in range(len(corpus)):
    print(f"BoW representation of Document {i+1}: {X[i].toarray()[0]}")

Output

BoW representation of Document 1: [0 1 1 1 0 0 1 0 1]
BoW representation of Document 2: [0 2 0 1 0 1 1 0 1]
BoW representation of Document 3: [1 0 0 1 1 0 1 1 1]
BoW representation of Document 4: [0 1 1 1 0 0 1 0 1]
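Each position in a BoW vector corresponds to one word of the learned vocabulary, sorted alphabetically. A quick way to see that mapping (a sketch; get_feature_names_out assumes scikit-learn 1.0 or newer, older releases expose get_feature_names instead):

print(vectorizer.get_feature_names_out())
# ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
# so the 2 in Document 2's vector is the double occurrence of "document"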
TF-IDF

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import math

def calculate_tf(word, document):
    word_frequency = document.count(word)
    return word_frequency / len(document)

def calculate_idf(word, corpus):
    num_documents_containing_word = len([True for document in corpus if word in document])
    if num_documents_containing_word == 0:
        return 0
    else:
        return math.log10(len(corpus) / num_documents_containing_word)

def calculate_tfidf(document, corpus):
    PS = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    words = [PS.stem(word.lower()) for word in word_tokenize(document) if word.lower() not in stop_words]
    word_tfidf_values = {}
    for word in words:
        if word not in word_tfidf_values:
            tf = calculate_tf(word, words)
            idf = calculate_idf(word, corpus)
            word_tfidf_values[word] = tf * idf
    return word_tfidf_values

corpus = ["This is the first document", "This document is the second document",
          "And this is the third one", "Is this the first document"]
document = "This is the second document"
tfidf_vector = calculate_tfidf(document, corpus)
print(tfidf_vector)

Output

{'second': 0.3010299956639812, 'document': 0.06246936830414996}
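As a sanity check on that output: after stopword removal and stemming, the query document reduces to ['second', 'document'], so each term has TF = 1/2. 'second' occurs in 1 of the 4 corpus strings, giving IDF = log10(4/1) ≈ 0.602 and TF-IDF ≈ 0.301; 'document' occurs in 3 of 4, giving IDF = log10(4/3) ≈ 0.125 and TF-IDF ≈ 0.062, matching the printed values. (scikit-learn's TfidfVectorizer uses a smoothed natural-log IDF and L2 normalisation, so it would not reproduce these exact numbers.)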

Pos Tagging

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

line = "quick brown fox jumps over the lazy dog"
tokens = nltk.word_tokenize(line)
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
pos_tags = nltk.pos_tag(lemmatized_tokens)
pos_word_corpus = [(word, tag) for word, tag in pos_tags]

for word, tag in pos_word_corpus:
    print(word, ":", tag)

Output

quick : JJ
brown : NN
fox : JJ
jump : NN
lazy : NN
dog : NN
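The tags in the output are Penn Treebank abbreviations (JJ = adjective, NN = singular noun). NLTK can print the definition of any tag, assuming the 'tagsets' resource has been downloaded:

nltk.download('tagsets')
nltk.help.upenn_tagset('JJ')   # prints the description and example words for the JJ tag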
Named Entity Recognition

import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

text = "Josh works for Twitter in California."
tokens = nltk.word_tokenize(text)
tagged = nltk.pos_tag(tokens)
entities = nltk.chunk.ne_chunk(tagged)
for entity in entities:
    if hasattr(entity, 'label'):
        # join the words of multi-word entities with spaces
        print(entity.label(), ' '.join(c[0] for c in entity.leaves()))

Output

PERSON Josh
GPE Twitter
GPE California
Pos Tagging via HMM

import nltk
nltk.download('brown')
from nltk.corpus import brown

def train_hmm_tagger():
    tagged_sentence = brown.tagged_sents(categories='news')
    size = int(len(tagged_sentence) * 0.9)
    trained_sents = tagged_sentence[:size]
    test_sents = tagged_sentence[size:]
    symbols = set([word for sentence in tagged_sentence for word, _ in sentence])
    states = set([tag for sentence in tagged_sentence for _, tag in sentence])
    trainer = nltk.tag.hmm.HiddenMarkovModelTrainer(states=states, symbols=symbols)
    hmm_tagger = trainer.train_supervised(trained_sents)
    return hmm_tagger

def pos_tag_sentence(sentence, hmm_tagger):
    tokens = nltk.word_tokenize(sentence)
    tagged_tokens = hmm_tagger.tag(tokens)
    return tagged_tokens

hmm_tagger = train_hmm_tagger()
sentence = input("Enter the sentence to be tagged?")
tagged = pos_tag_sentence(sentence, hmm_tagger)
print(tagged)

Output

Enter the sentence to be tagged? The sky is so beautiful.
[('The', 'AT'), ('sky', 'NN'), ('is', 'BEZ'), ('so', 'QL'), ('beautiful', 'JJ')]
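train_hmm_tagger builds test_sents but never uses it; returning that held-out slice as well would let the tagger be scored. A minimal sketch, assuming NLTK 3.6+ where taggers expose accuracy() (earlier releases call the same method evaluate()):

# change the last line of train_hmm_tagger to: return hmm_tagger, test_sents
hmm_tagger, test_sents = train_hmm_tagger()
print("Accuracy on held-out news sentences:", hmm_tagger.accuracy(test_sents))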
Chatbot

import nltk
from nltk.chat.util import Chat, reflections

pairs = [[r"Hello|hi|hey|hola",
          ["Hello, I am Aura, your AI assistant. How may I help you?"]],
         [r"How are you|How are you doing",
          ["I'm good, how about you?"]],
         [r"What song always gets you in a good mood?",
          ['"Happy" by Pharrell Williams never fails to put a smile on my face.']],
         [r"Suggest a trending song",
          ['Good 4 U by Olivia Rodrigo',
           'Montero (Call Me By Your Name) by Lil Nas X',
           'Save Your Tears by The Weeknd',
           'Levitating by Dua Lipa']],
         [r"quit", ["Good bye"]],
         [r"(.*)", ["Could you try again?"]]]

bot = Chat(pairs, reflections)
bot.converse()

Output

>hi
Hello, I am Aura, your AI assistant. How may I help you?
>how are you
I'm good, how about you?
>What song always gets you in a good mood?
"Happy" by Pharrell Williams never fails to put a smile on my face.
>Suggest a trending song
Save Your Tears by The Weeknd
>bye
Good bye
TEXT CLASSIFICATION USING LOGISTIC REGRESSION

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

def preprocess(text):
    ps = PorterStemmer()
    stop_words = set(stopwords.words('english'))
    words = [word_tokenize(sentence) for sentence in text]
    filtered_words = [[ps.stem(word) for word in tokenized
                       if word not in stop_words and word.isalpha()]
                      for tokenized in words]
    filtered_sentences = [' '.join(sentence) for sentence in filtered_words]
    return filtered_sentences

sentences = ["The food is tasty", "the quality of food is low",
             "i will never recommend their food", "I got sick after having their food",
             "I was in cloudnine after tasting their food",
             "My favourite is their desserts", "the food was not cooked properly"]
classes = [1, 0, 0, 0, 1, 1, 0]
test_sentences = ["food is not cooked properly", "I feel sick after having food",
                  "I love their desserts", "was in cloudnine after tasting their food"]

vectorizer = CountVectorizer()
sentences = preprocess(sentences)
vect1 = vectorizer.fit_transform(sentences)
# Optional split for held-out testing:
# train_data, test_data, train_labels, test_labels = train_test_split(vect1, classes, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(vect1, classes)
test_sentences = preprocess(test_sentences)
vect2 = vectorizer.transform(test_sentences)
pred_classes = model.predict(vect2)
print(pred_classes)

Output

[0 0 1 1]
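To relate the printed labels back to the reviews, the predictions can be zipped with the raw test sentences. A small sketch (the listing overwrites test_sentences with its preprocessed form, so the raw strings are kept in a separate variable here):

raw_test_sentences = ["food is not cooked properly", "I feel sick after having food",
                      "I love their desserts", "was in cloudnine after tasting their food"]
for sentence, label in zip(raw_test_sentences, pred_classes):
    # the training data used 1 for positive reviews and 0 for negative ones
    print(f"{sentence} -> {label}")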
TEXT CLASSIFICATION USING NAÏVE BAYES

import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

nltk.download('movie_reviews')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(token) for token in tokens
              if token not in stop_words and token.isalpha()]
    return dict(nltk.FreqDist(tokens))

pos_reviews = [(movie_reviews.raw(fileid), 'positive') for fileid in movie_reviews.fileids('pos')]
neg_reviews = [(movie_reviews.raw(fileid), 'negative') for fileid in movie_reviews.fileids('neg')]
tot_rev = pos_reviews + neg_reviews
processed_data = [(preprocess(text), category) for (text, category) in tot_rev]
train_data, val_data = train_test_split(processed_data, test_size=0.2, random_state=42)

classifier = NaiveBayesClassifier.train(train_data)

new_text = ["The movie was amazing", "the movie was terrible", "The movie was awful"]
for text in new_text:
    new_features = preprocess(text)
    predicted_category = classifier.classify(new_features)
    print(f"The predicted category for '{text}' is '{predicted_category}'")

Output

The predicted category for 'The movie was amazing' is 'positive'
The predicted category for 'the movie was terrible' is 'negative'
The predicted category for 'The movie was awful' is 'negative'
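The split above sets aside val_data but never scores the model on it. A short sketch using the standard nltk.classify utilities to report validation accuracy and the strongest features:

from nltk.classify import accuracy
print("Validation accuracy:", accuracy(classifier, val_data))   # fraction of val_data labelled correctly
classifier.show_most_informative_features(10)                   # words with the largest likelihood ratios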
