NLP Lab Programs
1. Convert the text into tokens and find the word frequency
import re
from collections import Counter

def tokenize(text):
    # Lower-case the text and split it into word tokens
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens

def word_frequency(tokens):
    # Count how often each token occurs
    frequency = Counter(tokens)
    return frequency

text = "This is a simple text. This text is for testing the word frequency program. This is simple."
tokens = tokenize(text)
frequency = word_frequency(tokens)
print("Tokens:", tokens)
print("Word Frequency:", frequency)
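# For the sample text above, 'this' and 'is' occur three times, 'simple' and
# 'text' occur twice, and every other token occurs once.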
2. Perform lemmatization and stemming on the text

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

def tokenize(text):
    tokens = word_tokenize(text.lower())
    return tokens

def lemmatize(tokens):
    # Reduce each token to its dictionary (lemma) form
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

def stem(tokens):
    # Strip suffixes with the Porter stemmer
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens
text = "The striped bats are hanging on their feet for best."
tokens = tokenize(text)
lemmatized_tokens = lemmatize(tokens)
print("Lemmatized Tokens:")
print(lemmatized_tokens)
stemmed_tokens = stem(tokens)
print("\nStemmed Tokens:")
print(stemmed_tokens)
3. Implement Bi-gram
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import bigrams
nltk.download('punkt')
def generate_bigrams(text):
    tokens = word_tokenize(text.lower())
    bigram_list = list(bigrams(tokens))
    return bigram_list
text = "The striped bats are hanging on their feet for best."
bigrams_result = generate_bigrams(text)
print("Bigrams:")
for bigram in bigrams_result:
    print(bigram)
4. Identify parts of speech using the Penn Treebank tag set
import nltk
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
sentence = "The quick brown fox jumps over the lazy dog."
words = word_tokenize(sentence)
tagged_words = pos_tag(words)
print("Sentence:", sentence)
print("Tagged Words:", tagged_words)
The Penn Treebank tag set:
1. CC Coordinating conjunction
2. CD Cardinal number
3. DT Determiner
4. EX Existential there
5. FW Foreign word
6. IN Preposition or subordinating conjunction
7. JJ Adjective
8. JJR Adjective, comparative
9. JJS Adjective, superlative
10. LS List item marker
11. MD Modal
12. NN Noun, singular or mass
13. NNS Noun, plural
14. NNP Proper noun, singular
15. NNPS Proper noun, plural
16. PDT Predeterminer
17. POS Possessive ending
18. PRP Personal pronoun
19. PRP$ Possessive pronoun
20. RB Adverb
21. RBR Adverb, comparative
22. RBS Adverb, superlative
23. RP Particle
24. SYM Symbol
25. TO to
26. UH Interjection
27. VB Verb, base form
28. VBD Verb, past tense
29. VBG Verb, gerund or present participle
30. VBN Verb, past participle
31. VBP Verb, non-3rd person singular present
32. VBZ Verb, 3rd person singular present
33. WDT Wh-determiner
34. WP Wh-pronoun
35. WP$ Possessive wh-pronoun
36. WRB Wh-adverb
5. Implement HMM for POS tagging and Build a Chunker
import nltk
from nltk.corpus import treebank
from collections import defaultdict

nltk.download('treebank')
tagged_sentences = treebank.tagged_sents()

# States are the POS tags observed in the corpus
states = set()
for sentence in tagged_sentences:
    for word, tag in sentence:
        states.add(tag)
states = list(states)

# Initial probabilities: how often each tag starts a sentence
initial_counts = defaultdict(int)
for sentence in tagged_sentences:
    initial_counts[sentence[0][1]] += 1
total_count = sum(initial_counts.values())
initial_probabilities = {tag: count / total_count for tag, count in initial_counts.items()}

# Transition (tag -> tag) and emission (tag -> word) probabilities; the sheet
# omits this step, so the counts below are a standard reconstruction
transition_counts = defaultdict(lambda: defaultdict(int))
emission_counts = defaultdict(lambda: defaultdict(int))
for sentence in tagged_sentences:
    for i, (word, tag) in enumerate(sentence):
        emission_counts[tag][word.lower()] += 1
        if i > 0:
            transition_counts[sentence[i - 1][1]][tag] += 1
transition_probabilities = {prev: {t: c / sum(nxt.values()) for t, c in nxt.items()}
                            for prev, nxt in transition_counts.items()}
emission_probabilities = {tag: {w: c / sum(ws.values()) for w, c in ws.items()}
                          for tag, ws in emission_counts.items()}

def viterbi(observations, states, start_p, trans_p, emit_p):
    V = [{}]
    path = {}
    # Initialization with the first observation (1e-10 stands in for smoothing of unseen words)
    for y in states:
        V[0][y] = start_p.get(y, 0) * emit_p[y].get(observations[0], 1e-10)
        path[y] = [y]
    # Recursion over the remaining observations
    for t in range(1, len(observations)):
        V.append({})
        newpath = {}
        for y in states:
            (prob, state) = max((V[t - 1][y0] * trans_p.get(y0, {}).get(y, 0)
                                 * emit_p[y].get(observations[t], 1e-10), y0)
                                for y0 in states)
            V[t][y] = prob
            newpath[y] = path[state] + [y]
        path = newpath
    n = len(observations) - 1
    (prob, state) = max((V[n][y], y) for y in states)
    return (prob, path[state])

sentence = "the quick brown fox jumps over the lazy dog"
observations = sentence.lower().split()
prob, tags = viterbi(observations, states, initial_probabilities,
                     transition_probabilities, emission_probabilities)
print("Sentence:", sentence)
print("Tags:", tags)
sentence = "The quick brown fox jumps over the lazy dog"
chunks = chunk_sentence(sentence, grammar)
print(chunks)
6. Find the synonyms and antonyms of a word using WordNet
import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')
def get_synonyms(word):
    synonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return synonyms

def get_antonyms(word):
    antonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())
    return antonyms
word = "happy"
synonyms = get_synonyms(word)
antonyms = get_antonyms(word)
import spacy
# Example text
text = "Barack Obama was born in Hawaii. He was elected president in 2008."
import tensorflow as tf
print(tf.__version__)
!pip install numpy tensorflow keras scikit-learn nltk
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, TimeDistributed
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize
nltk.download('treebank')
nltk.download('punkt')
def load_data():
    # Word and tag sequences from the Penn Treebank sample
    sentences = [[w for w, _ in s] for s in treebank.tagged_sents()]
    sentence_tags = [[t for _, t in s] for s in treebank.tagged_sents()]
    return sentences, sentence_tags

sentences, sentence_tags = load_data()
words = [w.lower() for s in sentences for w in s]
tags_flattened = [t for s in sentence_tags for t in s]
unique_words = sorted(set(words))
unique_tags = sorted(set(tags_flattened))

# Index words and tags; 0 is reserved for padding, 1 for unknown words
word_to_index = {w: i + 2 for i, w in enumerate(unique_words)}
word_to_index['<PAD>'] = 0
word_to_index['<UNK>'] = 1
tag_to_index = {t: i + 1 for i, t in enumerate(unique_tags)}
tag_to_index['<PAD>'] = 0
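# The sheet omits the step that turns the sentences into padded index sequences
# and splits them for training; the lines below are an assumed reconstruction
# (MAX_LEN = 100 and the 80/20 split are arbitrary choices).
MAX_LEN = 100
X = [[word_to_index.get(w.lower(), word_to_index['<UNK>']) for w in s] for s in sentences]
y = [[tag_to_index[t] for t in s] for s in sentence_tags]
X = pad_sequences(X, maxlen=MAX_LEN, padding='post', value=word_to_index['<PAD>'])
y = pad_sequences(y, maxlen=MAX_LEN, padding='post', value=tag_to_index['<PAD>'])
y = to_categorical(y, num_classes=len(tag_to_index))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)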
model = Sequential()
model.add(Embedding(input_dim=len(word_to_index), output_dim=50, input_length=X_train.shape[1]))
model.add(LSTM(units=100, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))
model.add(TimeDistributed(Dense(len(tag_to_index), activation='softmax')))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
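# Training and the predict_pos() helper used in the example below are missing
# from the sheet; this is an assumed reconstruction (one epoch keeps the demo fast).
model.fit(X_train, y_train, batch_size=32, epochs=1, validation_split=0.1)

index_to_tag = {i: t for t, i in tag_to_index.items()}

def predict_pos(sentence):
    tokens = word_tokenize(sentence)
    encoded = [word_to_index.get(t.lower(), word_to_index['<UNK>']) for t in tokens]
    padded = pad_sequences([encoded], maxlen=MAX_LEN, padding='post', value=word_to_index['<PAD>'])
    tag_ids = model.predict(padded)[0].argmax(axis=-1)[:len(tokens)]
    return list(zip(tokens, [index_to_tag[i] for i in tag_ids]))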
# Example usage
sentence = "Barack Obama was born in Hawaii."
print(predict_pos(sentence))
10. Develop a movie review system (sentiment analysis on movie data)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')
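# load_movie_reviews() is called below but never defined on the sheet; this is a
# minimal assumed version built on NLTK's movie_reviews corpus.
from nltk.corpus import movie_reviews

def load_movie_reviews():
    rows = []
    for category in movie_reviews.categories():            # 'neg' and 'pos'
        for fileid in movie_reviews.fileids(category):
            rows.append({'review': movie_reviews.raw(fileid), 'sentiment': category})
    return pd.DataFrame(rows)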
data = load_movie_reviews()
# Map sentiments to binary labels
data['sentiment'] = data['sentiment'].map({'pos': 1, 'neg': 0})
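# The pipeline and its training/evaluation step are also missing; an assumed
# reconstruction using the imports above (CountVectorizer + MultinomialNB).
X_train, X_test, y_train, y_test = train_test_split(
    data['review'], data['sentiment'], test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB()),
])
pipeline.fit(X_train, y_train)

predictions = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))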
def predict_sentiment(review):
    prediction = pipeline.predict([review])
    sentiment = 'Positive' if prediction[0] == 1 else 'Negative'
    return sentiment
# Example usage
new_review = "The movie was fantastic! I really enjoyed the performances."
print(f"Review: {new_review}")
print(f"Predicted Sentiment: {predict_sentiment(new_review)}")