All Practicals
Task:
Write a C program to:
Count the occurrence frequency of a specific word token (e.g. “AAB”) in a file.
Count the occurrence frequency of all the unique words/tokens in the file.
Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define BUFFER_SIZE 1000

/* Reconstructed helper: count every occurrence of `word` in the open file. */
int countOccurrences(FILE *fptr, const char *word)
{
    char str[BUFFER_SIZE];
    char *pos;
    int count = 0;
    while (fgets(str, BUFFER_SIZE, fptr) != NULL) {
        int index = 0;
        while ((pos = strstr(str + index, word)) != NULL) {
            index = (pos - str) + strlen(word);
            count++;
        }
    }
    return count;
}

int main()
{
    FILE *fptr;
    char path[1000];
    char word[1000];
    int wCount;
    printf("Enter file path: ");
    scanf("%s", path);
    printf("Enter word to count: ");
    scanf("%s", word);
    fptr = fopen(path, "r");
    if (fptr == NULL) {
        printf("Unable to open file: %s\n", path);
        exit(EXIT_FAILURE);
    }
    wCount = countOccurrences(fptr, word);
    printf("'%s' occurs %d times in the file.\n", word, wCount);
    fclose(fptr);
    return 0;
}
#include <stdio.h>
#include <string.h>
int main()
{
    FILE* filePointer;
    char dataToBeRead[1000];
    filePointer = fopen("sample.txt", "r"); /* assumed input file name */
    if (filePointer == NULL) {
        printf("The file failed to open.\n");
    }
    else {
        printf("The file contents are:\n");
        printf("---------------------------\n");
        while (fgets(dataToBeRead, 1000, filePointer) != NULL) {
            printf("%s", dataToBeRead);
        }
        fclose(filePointer);
    }
    return 0;
}
Conclusion: The program successfully displayed the content of the sample .TXT file, provided
the counts of total words and unique words, reported the frequency of the specified word
token ("AAB"), and presented the occurrence frequency of all unique words in the file.
08/08/23 22/08/23
Practical 2 : Text Preprocessing
Task:
Take any arbitrary string and perform the following tasks on it:
Count the number of tokens in the string (using the split function and a word tokenizer).
Take any news corpus and pre-process it (all needed functionality: case normalization, contraction expansion, punctuation removal, stop-word removal).
Calculate the term frequency for each term in the news corpus. (Does it point to the topic of the corpus?)
Code
!pip install langdetect nltk
import nltk
from langdetect import detect
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
nltk.download('punkt')
nltk.download('stopwords')
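The identify_language helper and the sample strings text and text2 are not shown in the listing; a minimal sketch using langdetect.detect, with hypothetical sample strings that reproduce the output below:
# Assumed helper (not in the original listing): return the detected language code
def identify_language(s):
    return detect(s)
# Hypothetical sample inputs (any English / Hindi strings give 'en' / 'hi')
text = "Natural language processing helps computers understand human language."
text2 = "प्राकृतिक भाषा प्रसंस्करण कंप्यूटर को मानव भाषा समझने में मदद करता है।"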
print("Language:", identify_language(text))
print("Language:", identify_language(text2))
Language: en
Language: hi
print(count_length(text))
24
def count_tokens(text):
    tokens = word_tokenize(text)
    return len(tokens)
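count_unique_tokens is called below but its definition does not appear in the listing; a minimal sketch counting distinct tokens:
# Assumed helper (not in the original listing): number of distinct tokens
def count_unique_tokens(text):
    return len(set(word_tokenize(text)))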
print(count_tokens(text))
print(count_unique_tokens(text))
def preprocess_corpus(corpus):
    # Tokenize and remove stopwords and punctuation
    tokens = word_tokenize(corpus)
    tokens = [token.lower() for token in tokens if token.isalpha()]
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    return tokens
def calculate_term_frequency(tokens):
    freq_dist = FreqDist(tokens)
    term_freq = {word: freq for word, freq in freq_dist.items()}
    return term_freq
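A short usage sketch (not part of the original listing) showing how the term frequencies hint at the topic; the news snippet is hypothetical:
news = ("The government announced a new climate policy on Monday. "
        "The policy aims to cut carbon emissions across all sectors.")
tokens = preprocess_corpus(news)
term_freq = calculate_term_frequency(tokens)
print(sorted(term_freq.items(), key=lambda kv: kv[1], reverse=True))
# The most frequent content words (e.g. 'policy') point to the topic of the corpus.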
contractions = {
"don't": "do not",
"doesn't": "does not",
"can't": "cannot",
"won't": "will not",
"haven't": "have not",
"hasn't": "has not",
"couldn't": "could not",
"shouldn't": "should not",
"wouldn't": "would not",
"it's": "it is",
"I'm": "I am",
"you're": "you are",
"they're": "they are",
"we're": "we are"
}
def expand_contractions(text):
    words = text.split()
    expanded_words = []
    for word in words:
        if word.lower() in contractions:
            expanded_words.extend(contractions[word.lower()].split())
        else:
            expanded_words.append(word)
    expanded_text = ' '.join(expanded_words)
    return expanded_text
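contraction_text is not defined in the listing; a hypothetical input string:
# Hypothetical input (not in the original listing)
contraction_text = "I'm sure they're happy, but we can't pretend it doesn't matter."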
expanded_text = expand_contractions(contraction_text)
print("Expanded Text:", expanded_text)
22/08/23 29/08/23
Practical 3 : WordNet for Synonym and Antonym Detection
Task:
Find the synonym /antonym of a word using WordNet.
Code
!pip install nltk spacy
!python -m spacy download en_core_web_sm
import nltk
nltk.download('wordnet')
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer
import spacy
# NLTK Stemmers
porter = PorterStemmer()
snowball = SnowballStemmer("english")
# NLTK Lemmatizer
lemmatizer = WordNetLemmatizer()
# spaCy Lemmatizer
nlp = spacy.load("en_core_web_sm")
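The synsets variable checked below is not created in the listing; a minimal sketch using WordNet, with the query word 'happy' matching the printed output:
from nltk.corpus import wordnet
# Look up WordNet synsets for the query word (the output below corresponds to 'happy')
word = "happy"
synsets = wordnet.synsets(word)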
if synsets:
    print("Synonyms:")
    for synset in synsets:
        synonyms = [lemma.name() for lemma in synset.lemmas()]
        print(", ".join(synonyms))
Synonyms:
happy
felicitous, happy
glad, happy
happy, well-chosen
Antonyms: unhappy
29/08/23 26/09/23
Code
def readData():
    data = ['This is a dog', 'This is a cat', 'I love my cat', 'This is my name ']
    dat = []
    for i in range(len(data)):
        for word in data[i].split():
            dat.append(word)
    print(dat)
    return dat
def createBigram(data):
    listOfBigrams = []
    bigramCounts = {}
    unigramCounts = {}
    for i in range(len(data) - 1):
        if i < len(data) - 1 and data[i + 1].islower():
            bigram = (data[i], data[i + 1])
            listOfBigrams.append(bigram)
            bigramCounts[bigram] = bigramCounts.get(bigram, 0) + 1
        unigramCounts[data[i]] = unigramCounts.get(data[i], 0) + 1
    return listOfBigrams, unigramCounts, bigramCounts
def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    # reconstructed (not in the original listing): P(w2 | w1) = count(w1 w2) / count(w1)
    return {bg: bigramCounts[bg] / unigramCounts[bg[0]] for bg in listOfBigrams}
if __name__ == '__main__':
    data = readData()
    #data = ['this','is','my','cat']
    listOfBigrams, unigramCounts, bigramCounts = createBigram(data)
    bigramProb = calcBigramProb(listOfBigrams, unigramCounts, bigramCounts)
    inputList = 'This is my cat'  # assumed test sentence (not shown in the listing)
    outputProb1 = 1
    splt = inputList.split()
    bilist = [(splt[i], splt[i + 1]) for i in range(len(splt) - 1)]
    for i in range(len(bilist)):
        if bilist[i] in bigramProb:
            outputProb1 *= bigramProb[bilist[i]]
        else:
            outputProb1 *= 0
    print('\n' + 'Probability of sentence "' + inputList + '" = ' + str(outputProb1))
def readData():
    data = ['there is a big garden', 'children play in a garden', 'they play inside beautiful garden']
    dat = []
    for i in range(len(data)):
        for word in data[i].split():
            dat.append(word)
    print(dat)
    return dat
if __name__ == '__main__':
    data = readData()
    listOfBigrams, unigramCounts, bigramCounts = createBigram(data)
    bigramProb = calcBigramProb(listOfBigrams, unigramCounts, bigramCounts)
    inputList = 'they play in a garden'  # assumed test sentence (not shown in the listing)
    outputProb1 = 1
    splt = inputList.split()
    bilist = [(splt[i], splt[i + 1]) for i in range(len(splt) - 1)]
    for i in range(len(bilist)):
        if bilist[i] in bigramProb:
            outputProb1 *= bigramProb[bilist[i]]
        else:
            outputProb1 *= 0
    print('\n' + 'Probability of sentence "' + inputList + '" = ' + str(outputProb1))
def createTrigram(data):
    listOfTrigrams = []
    trigramCounts = {}
    bigramCounts = {}
    unigramCounts = {}
    # trigram/bigram counting reconstructed; only the unigram branch appears in the original listing
    for i in range(len(data)):
        if i < len(data) - 2:
            trigram = (data[i], data[i + 1], data[i + 2])
            listOfTrigrams.append(trigram)
            trigramCounts[trigram] = trigramCounts.get(trigram, 0) + 1
        if i < len(data) - 1:
            bigramCounts[(data[i], data[i + 1])] = bigramCounts.get((data[i], data[i + 1]), 0) + 1
        if data[i] in unigramCounts:
            unigramCounts[data[i]] += 1
        else:
            unigramCounts[data[i]] = 1
    return listOfTrigrams, unigramCounts, bigramCounts, trigramCounts
# Example data
data = ["The", "quick", "brown", "fox", "jumps", "over", "the", "lazy", "dog"]
26/09/23 03/10/23
Task:
Take any text corpus and calculate one-hot encoded vectors, build the term-document (TD) matrix, compute TF-IDF for some token terms and PPMI for finding the corresponding word of a given word, and use Word2Vec for word embeddings.
Code
from sklearn.feature_extraction.text import TfidfVectorizer
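The three documents d1, d2 and d3 are not shown in the listing; hypothetical stand-ins:
# Hypothetical documents (not in the original listing)
d1 = "Data science extracts knowledge from data."
d2 = "Machine learning is an important part of data science."
d3 = "Cats and dogs are popular pets."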
doc_corpus = [d1, d2, d3]
print(doc_corpus)
vec = TfidfVectorizer(stop_words='english')
matrix = vec.fit_transform(doc_corpus)
print("Feature Names\n", vec.get_feature_names_out())
print("Sparse Matrix\n", matrix.shape, "\n", matrix.toarray())
import pandas as pd
import numpy as np
corpus = ['data science is one of the most important fields of science',
          'this is one of the best data science courses',
          'data scientists analyze data']
# create a word set for the corpus
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
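The step that fits the vectorizer and produces tfidf and result is not shown in the listing; a minimal sketch using sklearn's TfidfVectorizer:
# Assumed step (not in the original listing): fit TF-IDF on the corpus
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(corpus)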
# get indexing
print('\nWord indexes:')
print(tfidf.vocabulary_)
# in matrix form
print('\ntf-idf values in matrix form:')
print(result.toarray())
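The Word2Vec portion mentioned in the task is not included in the listing; a minimal sketch using gensim (library choice and parameters assumed):
from gensim.models import Word2Vec
# Train a small Word2Vec model on the tokenized corpus (assumed parameters)
sentences = [doc.split() for doc in corpus]
w2v = Word2Vec(sentences, vector_size=50, window=2, min_count=1)
print(w2v.wv.most_similar('data'))  # nearest neighbours of 'data' in the embedding space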
Conclusion: The implemented Python code demonstrated several word-vectorization techniques, including one-hot encoding, the term-document (TD) matrix, term frequency-inverse document frequency (TF-IDF), positive pointwise mutual information (PPMI), and Word2Vec, showcasing how these methods capture semantic relationships and contextual information within a given text corpus.
3/10/23 17/10/23
Code
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
def classify_sentiment(sentence):
    analyzer = SentimentIntensityAnalyzer()
    sentiment_scores = analyzer.polarity_scores(sentence)
    # Decision step assumed (not shown in the listing): classify by the compound score
    if sentiment_scores['compound'] >= 0.05:
        return 'Positive'
    elif sentiment_scores['compound'] <= -0.05:
        return 'Negative'
    return 'Neutral'
import nltk
nltk.download('opinion_lexicon')
from nltk.corpus import opinion_lexicon
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())
def preprocess_sentence(sentence):
    # Convert the sentence to lowercase and split it into words
    words = sentence.lower().split()
    return words
def classify_sentiment(sentence):
    words = preprocess_sentence(sentence)
    # Decision step assumed (not shown in the listing): count lexicon hits
    pos = sum(1 for w in words if w in positive_words)
    neg = sum(1 for w in words if w in negative_words)
    if pos > neg:
        return 'Positive'
    elif neg > pos:
        return 'Negative'
    return 'Neutral'
# Example usage:
sentence = "I love this product, it's amazing!"
sentiment = classify_sentiment(sentence)
print(f"Sentence sentiment: {sentiment}")
Conclusion: The sentiment detection task was implemented using two different
approaches, and their performance was compared, revealing insights into the
effectiveness of each method for accurately identifying the sentiment of a given
sentence.
03/10/23 17/10/23
Task 2: How can an HMM be used for POS tagging? Illustrate Python code for transition probability and emission probability calculation.
Code
# Use a simple NLTK POS tagger (any ready-made function) for identifying the POS
# tags of the input sentence.
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
stop_words = set(stopwords.words('english'))
# Dummy text (txt1 and the tagging step inside the loop are assumed; not shown in the listing)
txt1 = "NLTK is a leading platform for building Python programs to work with human language data. It is widely used for POS tagging."
tokenized = sent_tokenize(txt1)
for i in tokenized:
    words = word_tokenize(i)
    tagged = nltk.pos_tag(words)
    print(tagged)
# Importing libraries
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time
nltk.download('treebank')
nltk.download('universal_tagset')
# Assumed data source (not shown in the listing): treebank sentences with the universal tagset
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
# split data into training and validation set in the ratio 80:20
train_set, test_set = train_test_split(nltk_data, train_size=0.80, test_size=0.20, random_state=101)
# flatten the training sentences into (word, tag) pairs (assumed; not shown in the listing)
train_tagged_words = [tup for sent in train_set for tup in sent]
# use set datatype to check how many unique tags are present in training data
tags = {tag for word, tag in train_tagged_words}
print(len(tags))
print(tags)
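The computation that fills tags_matrix (and the emission probabilities named in the task) is not included in the listing; a minimal sketch of the standard calculation, with function and variable names assumed:
# P(t2 | t1): how often tag t2 follows tag t1 in the training data
def t2_given_t1(t2, t1, train_bag=train_tagged_words):
    tag_seq = [pair[1] for pair in train_bag]
    count_t1 = len([t for t in tag_seq if t == t1])
    count_t2_t1 = 0
    for index in range(len(tag_seq) - 1):
        if tag_seq[index] == t1 and tag_seq[index + 1] == t2:
            count_t2_t1 += 1
    return count_t2_t1 / count_t1
# P(w | t): emission probability of word w given tag t
def word_given_tag(word, tag, train_bag=train_tagged_words):
    tag_list = [pair for pair in train_bag if pair[1] == tag]
    count_w_given_tag = len([pair for pair in tag_list if pair[0] == word])
    return count_w_given_tag / len(tag_list)
# t x t transition matrix: row = preceding tag, column = following tag
tags_matrix = np.zeros((len(tags), len(tags)), dtype='float32')
for i, t1 in enumerate(list(tags)):
    for j, t2 in enumerate(list(tags)):
        tags_matrix[i, j] = t2_given_t1(t2, t1)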
print(tags_matrix)
(Output: a 12 x 12 transition-probability matrix over the universal POS tags DET, ADV, ADJ, CONJ, NUM, PRON, NOUN, VERB, ADP, PRT, '.' and X.)
Conclusion: In conclusion, the provided Python code utilizing NLTK allows users to input text
and obtain the corresponding Part-of-Speech (POS) tags for each token, while Hidden Markov
Models (HMM) can be employed for POS tagging through the calculation of Transition
Probability and Emission Probability in a systematic and illustrative manner.
17/10/23 21/11/23
Code:
!pip install python-crfsuite
import nltk
nltk.download('treebank')
import pycrfsuite
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split
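word2features is used by sent2features below but its definition is not in the listing; a minimal sketch of a per-token feature extractor (feature names assumed):
def word2features(sent, i):
    # sent can be a list of plain words or of (word, tag) pairs
    tok = lambda t: t[0] if isinstance(t, tuple) else t
    word = tok(sent[i])
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
    ]
    if i > 0:
        features.append('-1:word.lower=' + tok(sent[i - 1]).lower())
    else:
        features.append('BOS')
    if i < len(sent) - 1:
        features.append('+1:word.lower=' + tok(sent[i + 1]).lower())
    else:
        features.append('EOS')
    return features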
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for word, label in sent]
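The data preparation and trainer construction are not shown in the listing; a minimal sketch (variable names assumed):
# Assumed data preparation (not in the original listing)
tagged_sents = list(treebank.tagged_sents())
train_sents, test_sents = train_test_split(tagged_sents, test_size=0.2, random_state=42)
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)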
trainer.set_params({
'c1': 1.0,
'c2': 1e-3,
'max_iterations': 50,
'feature.possible_transitions': True
})
model_filename = 'pos_tagger_model.crfsuite'
trainer.train(model_filename)
# Tag a sentence (load the trained model into a Tagger first)
tagger = pycrfsuite.Tagger()
tagger.open(model_filename)
example_sentence = treebank.sents()[0]
features = sent2features(example_sentence)
tags = tagger.tag(features)
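The flattened label lists used in the classification report below are not built in the listing; a minimal sketch (assumes X_test and y_test from the data-preparation sketch above):
from sklearn.metrics import classification_report
# Flatten per-sentence predictions and gold labels for token-level evaluation
y_pred_flat = [tag for sent in X_test for tag in tagger.tag(sent)]
y_test_flat = [tag for sent in y_test for tag in sent]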
print("\nClassification Report:")
print(classification_report(y_test_flat, y_pred_flat))
This/DT is/VBZ a/DT sample/NNP sentence/NNP for/IN POS/NNP tagging./NNP
import pycrfsuite
# Sample sentence
sample_sentence = "This is a sample sentence for POS tagging."
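The tagging step that produces the word/tag output shown above is not included; a minimal sketch reusing the trained CRF model (assumes the sent2features helper defined earlier):
tokens = sample_sentence.split()
tagger = pycrfsuite.Tagger()
tagger.open('pos_tagger_model.crfsuite')
pred = tagger.tag(sent2features(tokens))
print(' '.join(w + '/' + t for w, t in zip(tokens, pred)))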
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import treebank
from nltk.tokenize import word_tokenize
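The data encoding and model definition are not included in the listing; a minimal sketch of an LSTM tagger architecture, with placeholder vocabulary, tag-set and sequence sizes:
# Assumed placeholder sizes; in the full notebook these come from the treebank data
vocab_size, num_tags, max_len = 10000, 46, 50
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64),
    LSTM(64, return_sequences=True),  # one tag prediction per time step
    Dense(num_tags, activation="softmax"),
])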
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy",
metrics=["accuracy"])
predicted_tags = model.predict(sample_sequence)
17/10/23 21/11/23
Code:
! pip install spacy
! pip install nltk
! python -m spacy download en_core_web_sm
import spacy
from spacy import displacy
from spacy import tokenizer
nlp = spacy.load('en_core_web_sm')
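The input text and the doc object are not shown in the listing; the text below is reconstructed from the token and entity output that follows:
text = ("HI I am Atharva, I am from Aurangabad, Maharashtra, India. "
        "Currently I am persuing B-tech degree from Deogiri college")
doc = nlp(text)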
# tokenization
for token in doc:
    print(token.text)
# print entities
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)
# now we use the displacy function on doc
displacy.render(doc, style='ent', jupyter=True)
# function (NLTK-based NER on the same input text)
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
def get_named_entity():
    try:
        tokenized = nltk.sent_tokenize(text)  # assumed: sentence-tokenize the input text
        for i in tokenized:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged, binary=False)
            namedEnt.draw()
    except Exception:
        pass
get_named_entity()
# tokenization
for token in doc:
    print(token.text)
# print entities
ents = [(e.text, e.start_char, e.end_char, e.label_) for e in doc.ents]
print(ents)
# now we use the displacy function on doc
displacy.render(doc, style='ent', jupyter=True)
HI
I
am
Atharva
,
I
am
from
Aurangabad
,
Maharashtra
,
India
.
Currently
I
am
persuing
B
-
tech
degree
from
Deogiri
college
[('Atharva', 8, 15, 'PERSON'), ('Aurangabad', 27, 37, 'GPE'), ('Maharashtra',
39, 50, 'GPE'), ('India', 52, 57, 'GPE'), ('Deogiri', 102, 109, 'ORG')]
HI I am Atharva PERSON , I am from Aurangabad GPE , Maharashtra GPE , India GPE . Currently I am
Conclusion: In conclusion, the provided Python code successfully performs Named Entity
Recognition (NER) on input text, accurately identifying and extracting entities such as names,
locations, and organizations.
21/11/23