NLP Lab Programs

1. Convert the text into tokens and find the word frequency

from collections import Counter
import re

def tokenize(text):
    tokens = re.findall(r'\b\w+\b', text.lower())
    return tokens

def word_frequency(tokens):
    frequency = Counter(tokens)
    return frequency

text = "This is a simple text. This text is for testing the word frequency program. This is simple."

tokens = tokenize(text)
frequency = word_frequency(tokens)

for word, count in frequency.items():
    print(f"{word}: {count}")
2. Perform Lemmatization and Stemming

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

def tokenize(text):
    tokens = word_tokenize(text.lower())
    return tokens

def lemmatize(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

def stem(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

text = "The striped bats are hanging on their feet for best."

tokens = tokenize(text)

lemmatized_tokens = lemmatize(tokens)
print("Lemmatized Tokens:")
print(lemmatized_tokens)

stemmed_tokens = stem(tokens)
print("\nStemmed Tokens:")
print(stemmed_tokens)
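Note that WordNetLemmatizer.lemmatize() treats every token as a noun unless a part of speech is passed, which is why a form like "hanging" is left unchanged above. An optional sketch (not part of the original program) that lemmatizes the same tokens as verbs:

# Optional: lemmatize as verbs; e.g. "hanging" reduces to "hang" when pos='v' is given.
verb_lemmatizer = WordNetLemmatizer()
verb_lemmas = [verb_lemmatizer.lemmatize(token, pos='v') for token in tokens]
print(verb_lemmas)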
3. Implement Bi-gram

import nltk
from nltk.tokenize import word_tokenize
from nltk.util import bigrams

nltk.download('punkt')

def generate_bigrams(text):
    tokens = word_tokenize(text.lower())
    bigram_list = list(bigrams(tokens))
    return bigram_list

text = "The striped bats are hanging on their feet for best."

bigrams_result = generate_bigrams(text)

print("Bigrams:")
for bigram in bigrams_result:
    print(bigram)
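As an optional extension (a sketch, assuming the code above has already run), the bigrams can also be counted with collections.Counter:

from collections import Counter

# Optional: frequency of each bigram in the text.
bigram_counts = Counter(bigrams_result)
print(bigram_counts.most_common(3))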
4. Identify parts-of-speech using the Penn Treebank tag set

pip install nltk

import nltk
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize
from nltk import pos_tag

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

sentence = "The quick brown fox jumps over the lazy dog."

words = word_tokenize(sentence)

tagged_words = pos_tag(words)

print("Sentence:", sentence)
print("Tagged Words:", tagged_words)

POS tags defined in the Penn Treebank tag set:

1. CC     Coordinating conjunction
2. CD     Cardinal number
3. DT     Determiner
4. EX     Existential there
5. FW     Foreign word
6. IN     Preposition or subordinating conjunction
7. JJ     Adjective
8. JJR    Adjective, comparative
9. JJS    Adjective, superlative
10. LS    List item marker
11. MD    Modal
12. NN    Noun, singular or mass
13. NNS   Noun, plural
14. NNP   Proper noun, singular
15. NNPS  Proper noun, plural
16. PDT   Predeterminer
17. POS   Possessive ending
18. PRP   Personal pronoun
19. PRP$  Possessive pronoun
20. RB    Adverb
21. RBR   Adverb, comparative
22. RBS   Adverb, superlative
23. RP    Particle
24. SYM   Symbol
25. TO    to
26. UH    Interjection
27. VB    Verb, base form
28. VBD   Verb, past tense
29. VBG   Verb, gerund or present participle
30. VBN   Verb, past participle
31. VBP   Verb, non-3rd person singular present
32. VBZ   Verb, 3rd person singular present
33. WDT   Wh-determiner
34. WP    Wh-pronoun
35. WP$   Possessive wh-pronoun
36. WRB   Wh-adverb
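The same tag definitions are also available inside NLTK itself; a minimal sketch (requires the 'tagsets' resource, an extra download not used in the program above):

import nltk
nltk.download('tagsets')

nltk.help.upenn_tagset('NN')   # description and examples for a single tag
nltk.help.upenn_tagset()       # print the full Penn Treebank tag set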
5. Implement HMM for POS tagging and Build a Chunker
import nltk
from nltk.corpus import treebank
from collections import defaultdict

nltk.download('treebank')

tagged_sentences = treebank.tagged_sents()

states = set()
observations = set()

for sentence in tagged_sentences:
    for word, tag in sentence:
        states.add(tag)
        observations.add(word.lower())

states = list(states)
observations = list(observations)

transition_counts = defaultdict(lambda: defaultdict(int))


for sentence in tagged_sentences:
    prev_tag = "<s>"
    for word, tag in sentence:
        transition_counts[prev_tag][tag] += 1
        prev_tag = tag
    transition_counts[prev_tag]["</s>"] += 1

transition_probabilities = defaultdict(lambda: defaultdict(float))


for prev_tag, next_tags in transition_counts.items():
    total_count = sum(next_tags.values())
    for next_tag, count in next_tags.items():
        transition_probabilities[prev_tag][next_tag] = count / total_count

emission_counts = defaultdict(lambda: defaultdict(int))
for sentence in tagged_sentences:
    for word, tag in sentence:
        emission_counts[tag][word.lower()] += 1

emission_probabilities = defaultdict(lambda: defaultdict(float))


for tag, words in emission_counts.items():
    total_count = sum(words.values())
    for word, count in words.items():
        emission_probabilities[tag][word] = count / total_count

initial_counts = defaultdict(int)
for sentence in tagged_sentences:
    initial_counts[sentence[0][1]] += 1

initial_probabilities = defaultdict(float)
total_count = sum(initial_counts.values())
for tag, count in initial_counts.items():
    initial_probabilities[tag] = count / total_count

def viterbi(observations, states, start_p, trans_p, emit_p):
    V = [{}]
    path = {}

    for state in states:
        V[0][state] = start_p[state] * emit_p[state].get(observations[0], 0)
        path[state] = [state]

    for t in range(1, len(observations)):
        V.append({})
        newpath = {}

        for y in states:
            (prob, state) = max((V[t-1][y0] * trans_p[y0].get(y, 0) * emit_p[y].get(observations[t], 0), y0)
                                for y0 in states)
            V[t][y] = prob
            newpath[y] = path[state] + [y]

        path = newpath

    n = len(observations) - 1
    (prob, state) = max((V[n][y], y) for y in states)
    return (prob, path[state])

sentence = "the quick brown fox jumps over the lazy dog"
observations = sentence.lower().split()
prob, tags = viterbi(observations, states, initial_probabilities, transition_probabilities, emission_probabilities)
print("Sentence:", sentence)
print("Tags:", tags)

from nltk import pos_tag, word_tokenize
from nltk.chunk import RegexpParser

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def chunk_sentence(sentence, grammar):
    words = word_tokenize(sentence)
    tagged = pos_tag(words)
    parser = RegexpParser(grammar)
    chunks = parser.parse(tagged)
    return chunks

grammar = """
NP: {<DT|JJ|NN.*>+}          # Chunk sequences of DT, JJ, NN
VP: {<VB.*><NP|PP|CLAUSE>+$} # Chunk verbs and their arguments
PP: {<IN><NP>}               # Chunk prepositions followed by NP
CLAUSE: {<NP><VP>}           # Chunk NP, VP
"""

sentence = "The quick brown fox jumps over the lazy dog"
chunks = chunk_sentence(sentence, grammar)
print(chunks)
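The parse result is an nltk.Tree, so individual chunk types can be pulled out directly; an optional sketch (not part of the original program) listing only the NP chunks:

# Optional: print just the noun-phrase chunks from the parse tree.
for subtree in chunks.subtrees(filter=lambda t: t.label() == 'NP'):
    print(' '.join(word for word, tag in subtree.leaves()))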
6. Find the synonyms and antonyms of a word using WordNet

import nltk
from nltk.corpus import wordnet as wn

nltk.download('wordnet')
nltk.download('omw-1.4')

def get_synonyms(word):
    synonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return synonyms

def get_antonyms(word):
    antonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            if lemma.antonyms():
                antonyms.add(lemma.antonyms()[0].name())
    return antonyms

word = "happy"

synonyms = get_synonyms(word)
antonyms = get_antonyms(word)

print(f"Synonyms of '{word}':", synonyms)


print(f"Antonyms of '{word}':", antonyms)

7. Implement semantic role labeling to identify named entities (Same as Program-9)

Go to the Command Prompt.

Install Python 3.12.4.

Change to the folder that contains your programs.

C:\> pip install spacy

C:\> python -m spacy download en_core_web_sm

C:\> python --version

C:\> pip --version

C:\> pip show spacy

C:\> python -c "import spacy"    (checks that spaCy imports without errors)

C:\> pip install jupyter

C:\> jupyter notebook

• This will open the Jupyter environment.

• Create a file named "test_spacy.ipynb".

Paste the following code into a single cell:

import spacy

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

# Example text
text = "Barack Obama was born in Hawaii. He was elected president in 2008."

# Process the text
doc = nlp(text)

# Extract named entities
named_entities = [(ent.text, ent.label_) for ent in doc.ents]

# Print named entities
print("Named Entities:", named_entities)

Run the cell in Jupyter, or save the same code as test_spacy.py and run it from the command prompt:

C:\> python test_spacy.py
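As an optional check (a sketch, not part of the original program), the following lines can be appended to the same cell; spacy.explain gives a short description of each entity label:

# Optional: print a short description of each entity label (e.g. GPE, DATE).
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_))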


8. Implement POS tagging using LSTM

!pip install tensorflow

import tensorflow as tf
print(tf.__version__)
!pip install numpy tensorflow keras scikit-learn nltk

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, TimeDistributed
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize

nltk.download('treebank')
nltk.download('punkt')

def load_data():
    sentences = treebank.sents()
    tags = treebank.tagged_sents()

    words = [word.lower() for sentence in sentences for word in sentence]
    tags_flattened = [tag for sentence in tags for _, tag in sentence]

    unique_words = sorted(set(words))
    unique_tags = sorted(set(tags_flattened))

    word_to_index = {word: i + 2 for i, word in enumerate(unique_words)}
    tag_to_index = {tag: i + 1 for i, tag in enumerate(unique_tags)}

    word_to_index['<PAD>'] = 0
    word_to_index['<UNK>'] = 1
    tag_to_index['<PAD>'] = 0

    index_to_word = {i: word for word, i in word_to_index.items()}
    index_to_tag = {i: tag for tag, i in tag_to_index.items()}

    # Lowercase the lookup as well, so it matches the lowercased vocabulary built above.
    X = [[word_to_index.get(word.lower(), 1) for word in sentence] for sentence in sentences]
    y = [[tag_to_index[tag] for word, tag in sentence] for sentence in tags]

    max_len = max(len(sentence) for sentence in X)
    X = pad_sequences(X, maxlen=max_len, padding='post')
    y = pad_sequences(y, maxlen=max_len, padding='post')
    y = to_categorical(y, num_classes=len(tag_to_index))

    return X, y, word_to_index, tag_to_index, index_to_word, index_to_tag

# Load data and mappings
X, y, word_to_index, tag_to_index, index_to_word, index_to_tag = load_data()

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Sequential()
model.add(Embedding(input_dim=len(word_to_index), output_dim=50, input_length=X_train.shape[1]))
model.add(LSTM(units=100, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))
model.add(TimeDistributed(Dense(len(tag_to_index), activation='softmax')))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, batch_size=32, epochs=5, validation_split=0.1, verbose=1)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
# Predict POS tags for a new sentence
def predict_pos(sentence):
    tokens = word_tokenize(sentence.lower())
    indices = [word_to_index.get(token, 1) for token in tokens]
    indices = pad_sequences([indices], maxlen=X_train.shape[1], padding='post')
    predictions = model.predict(indices)
    predicted_tags = [index_to_tag[np.argmax(tag)] for tag in predictions[0]]
    return list(zip(tokens, predicted_tags))

# Example usage
sentence = "Barack Obama was born in Hawaii."
print(predict_pos(sentence))
10. Develop a movie review system (sentiment analysis on movie data)

!pip install numpy pandas scikit-learn nltk

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')

from nltk.corpus import movie_reviews


from sklearn.datasets import load_files

# Load the movie reviews dataset from NLTK


def load_movie_reviews():
    reviews = []
    labels = []

    for fileid in movie_reviews.fileids():
        category = fileid.split('/')[0]  # 'pos' or 'neg'
        with movie_reviews.open(fileid) as f:
            review = f.read()
        reviews.append(review)
        labels.append(category)

    return pd.DataFrame({'review': reviews, 'sentiment': labels})

data = load_movie_reviews()
# Map sentiments to binary labels
data['sentiment'] = data['sentiment'].map({'pos': 1, 'neg': 0})

# Split the data into training and testing sets
X = data['review']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a pipeline with CountVectorizer and MultinomialNB
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),  # Converts text to feature vectors
    ('classifier', MultinomialNB())     # Naive Bayes classifier
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))

def predict_sentiment(review):
    prediction = pipeline.predict([review])
    sentiment = 'Positive' if prediction[0] == 1 else 'Negative'
    return sentiment

# Example usage
new_review = "The movie was fantastic! I really enjoyed the performances."
print(f"Review: {new_review}")
print(f"Predicted Sentiment: {predict_sentiment(new_review)}")
