Name:- Vipin Rawat Course:- MCA Section:- 4E

Roll No:-

Q1:-Write a Python program to tokenize a statement.

import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')
sentence = "Tokenize this sentence."
tokens = word_tokenize(sentence)
print(tokens)

Q2:-Remove the stopwords.

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
filtered_sentence = [word for word in tokens if word.lower() not in stop_words]
print(filtered_sentence)

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
tokens = ["the", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
stop_words = set(stopwords.words('english'))
newfiltered_sentence = [word for word in tokens if not word.lower() in stop_words]
print(newfiltered_sentence)

Q3:-To carry out Stemming or Lemmatization.



from nltk.stem import PorterStemmer

# stem the tokens produced in Q1/Q2
ps = PorterStemmer()
stemmed_words = [ps.stem(word) for word in tokens]
print(stemmed_words)
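
The question also mentions lemmatization; a minimal sketch using NLTK's WordNetLemmatizer (assuming the 'wordnet' corpus has been downloaded) could look like this:

import nltk
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
# lemmatize the same tokens produced in Q1 (the default part of speech is noun)
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
print(lemmatized_words)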

Q4:-To carry out parts-of-speech tagging.

nltk.download('averaged_perceptron_tagger')
tagged_words = nltk.pos_tag(tokens)
print(tagged_words)

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
sentence = "This is a sample sentence"
tokens = nltk.word_tokenize(sentence)
tagged_words = nltk.pos_tag(tokens)
print(tagged_words)

Q5:-To carry out chunking of words based on parts-of-speech tagging.

nltk.download('maxent_ne_chunker')
nltk.download('words')
# named-entity chunking over the POS-tagged words from Q4
ne_chunks = nltk.ne_chunk(tagged_words)
print(ne_chunks)
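
Since ne_chunk performs named-entity chunking, a POS-pattern chunker is another option; a minimal sketch with nltk.RegexpParser and an illustrative noun-phrase grammar (the pattern is an assumption, not part of the original answer):

import nltk

# a simple noun-phrase rule: optional determiner, any adjectives, then one or more nouns
grammar = "NP: {<DT>?<JJ>*<NN.*>+}"
chunk_parser = nltk.RegexpParser(grammar)
chunk_tree = chunk_parser.parse(tagged_words)  # tagged_words from Q4
print(chunk_tree)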

Q6:-Regular expression tagger.

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, RegexpParser
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def tokenize_and_chunk_input():
    user_input = input("Enter a sentence: ")
    tokens = word_tokenize(user_input)
    tagged_tokens = pos_tag(tokens)
    grammar = r"""
        NP: {<DT|JJ|NN.*>+}
        PP: {<IN><NP>}
        VP: {<VB.*><NP|PP|CLAUSE>+$}
        CLAUSE: {<NP><VP>}
    """
    chunk_parser = RegexpParser(grammar)
    chunks = chunk_parser.parse(tagged_tokens)
    return chunks

if __name__ == "__main__":
    print("Tokenizing and chunking input from the user using NLTK...")
    chunked_input = tokenize_and_chunk_input()
    print("Chunked input:", chunked_input)


Q7:-Write a program to take input from the user and carry out the basic operations of NLP.

import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag, RegexpParser
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
def tokenize_and_chunk_input():
    user_input = input("Enter a sentence: ")
    tokens = word_tokenize(user_input)
    tagged_tokens = pos_tag(tokens)
    grammar = r"""
        NP: {<DT|JJ|NN.*>+}
        PP: {<IN><NP>}
        VP: {<VB.*><NP|PP|CLAUSE>+$}
        CLAUSE: {<NP><VP>}
    """
    chunk_parser = RegexpParser(grammar)
    chunks = chunk_parser.parse(tagged_tokens)
    return chunks

if __name__ == "__main__":
    print("Tokenizing and chunking input from the user using NLTK...")
    chunked_input = tokenize_and_chunk_input()
    print("Chunked input:", chunked_input)

Q8:- To calculate TF-IDF (term frequency-inverse document frequency) for a given set of sentences.

from sklearn.feature_extraction.text import TfidfVectorizer


documents = [
"This is the first document.",
"This document is the second document.",
"And this is the third one.",
"Is this the first document?",
]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
print(tfidf_matrix.toarray())

Q9:-Write a program to carry out count vectorisation and transform each document into a vector of word counts.

from sklearn.feature_extraction.text import CountVectorizer


documents = [
"This is the first document.",
"This document is the second document.",
"And this is the third one.",
"Is this the first document?",
]
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(documents)
print(bow_matrix.toarray())
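
To see which word each column of the matrix counts, the learned vocabulary can also be printed; a small follow-up sketch:

# column-to-word mapping for the bag-of-words matrix above
print(vectorizer.get_feature_names_out())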

Q10:-Write a Python program to find similar words using a Word2Vec model.

from gensim.models import Word2Vec


sentences = [
["this", "is", "the", "first", "sentence", "for", "word2vec"],
["this", "is", "the", "second", "sentence"],
["yet", "another", "sentence"],
["one", "more", "sentence"],
["and", "the", "final", "sentence"],
]
model = Word2Vec(sentences, min_count=1)
similar_words = model.wv.most_similar("sentence")
print(similar_words)
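
As a quick follow-up, the similarity between two specific words from the training vocabulary can be checked directly; a minimal sketch (the word pair is arbitrary):

# cosine similarity between two words seen during training
print(model.wv.similarity("first", "second"))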

Q11:-Write a Python program to find the words common to two given paragraphs.

import nltk
nltk.download('punkt')
paragraph1 = "Hello and welcome! How can I assist you today?"
paragraph2 = "Greetings! I'm here to help. What can I do for you?"
words_paragraph1 = set(nltk.word_tokenize(paragraph1.lower()))
words_paragraph2 = set(nltk.word_tokenize(paragraph2.lower()))

similar_words = words_paragraph1.intersection(words_paragraph2)
print("Similar words between the two paragraphs:")
print(similar_words)
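
A single overlap score can also be reported; a minimal sketch of Jaccard similarity over the two word sets defined above:

# Jaccard similarity: shared words divided by all distinct words across both paragraphs
union_words = words_paragraph1.union(words_paragraph2)
print("Jaccard similarity:", round(len(similar_words) / len(union_words), 3))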

Q12:-Write a Python program to tokenize the given text sentence by sentence.

import nltk
nltk.download('punkt')

def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

text = ("Tokenization is the process of breaking text into sentences. "
        "It's an important step in natural language processing.")
sentences = tokenize_sentences(text)
for sentence in sentences:
    print(sentence)

Q13:-Write a Python program to take a sample of at least five lines and tokenize it either by word or by sentence.

import nltk
nltk.download('punkt')

def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

text = ("Tokenization is the process of breaking text into sentences. "
        "It's an important step in natural language processing. Gjh. yhfh. Jhyfhu ")
sentences = tokenize_sentences(text)
for sentence in sentences:
    print(sentence)

nltk.sent_tokenize(text)
nltk.word_tokenize(text)

def tokenize_sentences(text):
    sentences = nltk.sent_tokenize(text)
    return sentences

def tokenize_words(sentence):
    words = nltk.word_tokenize(sentence)
    return words

for sentence in sentences:
    print(sentence)

Q14:-Write a Python program to download sample text from the Gutenberg corpus and extract the nouns as well as the verbs from it. The sample text is taken from shakespeare-hamlet.txt.

import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
sample_text = gutenberg.raw('shakespeare-hamlet.txt')[:1000]
tokens = word_tokenize(sample_text)
tagged_tokens = pos_tag(tokens)
nouns = [word for word, pos in tagged_tokens if pos.startswith('NN')]
verbs = [word for word, pos in tagged_tokens if pos.startswith('VB')]
print("Nouns:")
print(nouns)
print("\nVerbs:")
print(verbs)

Q15:-Write a Python program to remove stopwords from a given text using the built-in stopwords list from NLTK.

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_text = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_text)

text = "This is a sample sentence with some stop words that need to be removed."
filtered_text = remove_stopwords(text)
print("Filtered text:", filtered_text)



Q16:-Write a Python program to remove stopwords using a customized list of stopwords.

def remove_stopwords_custom(text, custom_stopwords):
    words = text.split()
    filtered_text = [word for word in words if word.lower() not in custom_stopwords]
    return ' '.join(filtered_text)

custom_stopwords = {'is', 'a', 'with', 'to', 'be'}
filtered_text_custom = remove_stopwords_custom(text, custom_stopwords)
print("Filtered text (custom):", filtered_text_custom)

Q17:-Write a Python program to remove stopwords using the stopword lists provided by other libraries.


import spacy
nlp = spacy.load("en_core_web_sm")
text = "This is a sample sentence with some stop words that need to be removed."

def remove_stopwords_spacy(text):
    doc = nlp(text)
    filtered_text = [token.text for token in doc if not token.is_stop]
    return ' '.join(filtered_text)

filtered_text_spacy = remove_stopwords_spacy(text)
print("Filtered text (spaCy):", filtered_text_spacy)

from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def remove_stopwords_sklearn(text):
    stop_words = set(ENGLISH_STOP_WORDS)
    words = text.split()
    filtered_text = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_text)

filtered_text_sklearn = remove_stopwords_sklearn(text)
print("Filtered text (scikit-learn):", filtered_text_sklearn)

from gensim.parsing.preprocessing import remove_stopwords as gensim_remove_stopwords


filtered_text_gensim = gensim_remove_stopwords(text)
print("Filtered text (Gensim):", filtered_text_gensim)

from nltk.stem import SnowballStemmer

def nltk_snowball_stemming(text):
    stemmer = SnowballStemmer("english")
    tokens = text.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)

text = "I am going to the store to buy some apples and oranges"
stemmed_text = nltk_snowball_stemming(text)
print(stemmed_text)

Q18:-Write a Python program to demonstrate stemming using the Porter stemmer.

from nltk.stem import PorterStemmer

def nltk_stemming(text):
    stemmer = PorterStemmer()
    tokens = text.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(stemmed_tokens)

text = "I am going to the store to buy some apples and oranges"
stemmed_text = nltk_stemming(text)
print(stemmed_text)

Q19:-Write a Python program to demonstrate stemming using the Lancaster stemmer.

from nltk.stem import LancasterStemmer

lancaster_stemmer = LancasterStemmer()
words = ["running", "flies", "swimming", "happier", "cats", "dogs"]
stemmed_words = [lancaster_stemmer.stem(word) for word in words]

for original, stemmed in zip(words, stemmed_words):
    print(f"Original: {original}, Stemmed: {stemmed}")

Q20:-Write a Python program to demonstrate stemming using a regular-expression stemmer.

import re

def regex_stemmer(word):
    patterns = [
        (r's$', ''),
        (r'ed$', ''),
        (r'ing$', '')
    ]
    for pattern, replacement in patterns:
        if re.search(pattern, word):
            return re.sub(pattern, replacement, word)
    return word

words = ["running", "flies", "swimming", "happier", "cats", "dogs"]
stemmed_words = [regex_stemmer(word) for word in words]
for original, stemmed in zip(words, stemmed_words):
    print(f"Original: {original}, Stemmed: {stemmed}")

from sklearn.feature_extraction.text import TfidfVectorizer


documents = [
"This is the first document.",
"This document is the second document.",
"And this is the third one.",
"Is this the first document?",
]
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())


Q21:-Write a Python program to carry out POS tagging.

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
text = "The quick brown fox jumps over the lazy dog."
words = word_tokenize(text)
default_pos_tags = pos_tag(words)
print("Default POS tagging:")
print(default_pos_tags)
print("\nRule-based POS tagging (same as default tagger):")
print(default_pos_tags)
nltk.download('hmm_treebank_pos_tagger')
hmm_tagger = nltk.tag.HiddenMarkovModelTagger.train([default_pos_tags])
hmm_pos_tags = hmm_tagger.tag(words)
print("\nStochastic POS tagging (Hidden Markov Model):")
print(hmm_pos_tags)
print("\nStochastic POS tagging (Hidden Markov Model):")
print(hmm_pos_tags)
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return None

wordnet_tags = pos_tag(words)
wordnet_pos_tags = [(word, get_wordnet_pos(tag)) for word, tag in wordnet_tags]
print("\nDictionary-based POS tagging (WordNet):")
print(wordnet_pos_tags)

Q22:-Write a Python program to develop a chatbot that helps to diagnose simple flu symptoms.

import random

greetings = ["Hello!", "Hi there!", "Welcome!", "Greetings!"]
common_questions = [
    "What is your name?",
    "How can I help you today?",
    "What symptoms are you experiencing?",
    "Do you have any allergies?",
    "Are you currently taking any medications?",
]
responses = [
    "I'm sorry, I'm just a chatbot and cannot provide medical advice. It's best to consult with a healthcare professional.",
    "Please consult with a doctor for proper diagnosis and treatment.",
    "It's important to seek medical attention for your condition.",
    "I recommend reaching out to a healthcare professional to discuss your concerns.",
]

def get_random_greeting():
    return random.choice(greetings)

def respond(user_input):
    if user_input.endswith("?"):
        return random.choice(responses)
    else:
        return random.choice(common_questions)

def chat():
    print(get_random_greeting())
    while True:
        user_input = input("> ")
        if user_input.lower() == "exit":
            break
        print(respond(user_input))

chat()

Q22:-Write a Python program to create an interactive health chatbot with a possible diagnosis and home treatment for the reported symptoms.

class HealthChatbot:
    def __init__(self):
        self.symptoms = []

    def greet_user(self):
        print("Hello! I am your health chatbot. Let's check your symptoms.")

    def ask_symptoms(self):
        print("Please answer the following questions with 'yes' or 'no'.")
        self.symptoms.append(input("Do you have a fever? ").lower())
        self.symptoms.append(input("Do you have a cough? ").lower())
        self.symptoms.append(input("Do you have difficulty breathing? ").lower())

    def diagnose(self):
        fever = self.symptoms[0] == 'yes'
        cough = self.symptoms[1] == 'yes'
        difficulty_breathing = self.symptoms[2] == 'yes'
        if fever and cough and difficulty_breathing:
            print("Based on your symptoms, you may have pneumonia. Please consult a doctor immediately.")
        elif fever and cough:
            print("Based on your symptoms, you may have a common cold or flu. Get plenty of rest and fluids.")
        elif difficulty_breathing:
            print("Based on your symptoms, you may have a respiratory issue. Seek medical attention promptly.")
        else:
            print("Based on your symptoms, you seem to be generally healthy. However, if you feel unwell, consult a doctor.")

    def start(self):
        self.greet_user()
        self.ask_symptoms()
        self.diagnose()

if __name__ == "__main__":
    chatbot = HealthChatbot()
    chatbot.start()

Q23:-Write a Python program using SVM and TF-IDF to analyse the given corpus of documents.

from sklearn.datasets import fetch_20newsgroups


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']
news = fetch_20newsgroups(categories=categories)
X_train, X_test, y_train, y_test = train_test_split(news.data, news.target, test_size=0.2, random_state=42)
model = make_pipeline(TfidfVectorizer(), SVC())
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Q24:-Write a Python program to calculate a bag of words after carrying out the following preprocessing of the text: convert all the text to lowercase and replace every punctuation character with a space.

from collections import Counter

def preprocess_text(text):
    # the question asks for lowercase, so use lower() rather than upper()
    text = text.lower()
    for punctuation in '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~':
        text = text.replace(punctuation, ' ')
    return text

def create_bag_of_words(text):
    text = preprocess_text(text)
    words = text.split()
    bag_of_words = Counter(words)
    return bag_of_words

text = "HOW how is the boss is a .Simple Example is sometimes better than better."
bow = create_bag_of_words(text)
print("Bag of Words:")
for word, count in bow.items():
print(f"{word}: {count}")
