
*********Practical – 1 Write a program to implement sentence and word tokenization.********

import nltk
nltk.download('punkt')

doc = ("An eight-time Olympic gold medalist, Bolt is the only sprinter to win "
       "Olympic 100 m and 200 m titles at three consecutive Olympics (2008, 2012, "
       "and 2016). He also won two 4 × 100 relay gold medals.")
tokens = nltk.word_tokenize(doc)
print(tokens)
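
The title also asks for sentence tokenization, which the listing above does not show. A minimal addition using nltk.sent_tokenize on the same doc string (a sketch, not part of the original listing):

sents = nltk.sent_tokenize(doc)   # split doc into sentences with the punkt model
print(sents)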

*****************Practical – 2 Write a program to implement stemming and lemmatization.*****************

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')

ps = PorterStemmer()
words = ["program", "programs", "programmer", "programming", "programmers"]
for w in words:
    print(w, ":", ps.stem(w))

lemmatizer = WordNetLemmatizer()
print("rocks:", lemmatizer.lemmatize("rocks"))

**************************Practical – 3 Write a program to implement a tri-gram model.********************

import nltk
import markovify
import os
from collections import Counter
nltk.download('punkt')

# Read every file in the trigram/ directory into a list of documents
content = []
for filename in os.listdir("trigram/"):
    with open(f'trigram/{filename}') as f:
        content.append(f.read())

# Tokenize, lowercase, and keep only tokens that contain at least one letter
corpus = []
for item in content:
    corpus.extend(
        [word.lower() for word in nltk.word_tokenize(item)
         if any(c.isalpha() for c in word)])

# Count trigrams and print the ten most frequent
ngrams = Counter(nltk.ngrams(corpus, 3))
for ngram, freq in ngrams.most_common(10):
    print(f'{freq}:{ngram}')

# Generate a sentence with a Markov chain trained on Usain.txt
with open('Usain.txt') as f:
    text = f.read()
model = markovify.Text(text)
sent = model.make_sentence()
print(sent)
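
The Counter above gives raw trigram counts; a rough maximum-likelihood trigram probability divides each trigram count by the count of its leading bigram. A small sketch, assuming the corpus token list built above:

bigrams = Counter(nltk.ngrams(corpus, 2))
trigrams = Counter(nltk.ngrams(corpus, 3))
for (w1, w2, w3), freq in trigrams.most_common(5):
    # P(w3 | w1, w2) is estimated as count(w1, w2, w3) / count(w1, w2)
    prob = freq / bigrams[(w1, w2)]
    print(f"P({w3} | {w1}, {w2}) = {prob:.3f}")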

********************************Practical – 4 Write a program to implement PoS tagging using HMM & Neural Model.*********************

from tabulate import tabulate
import pandas as pd

def generate_sequence(states, sequence_length):
    # Enumerate every possible tag sequence of the given length
    all_sequences = []
    nodes = []
    depth = sequence_length
    def gen_seq_recur(states, nodes, depth):
        if depth == 0:
            all_sequences.append(nodes)
        else:
            for state in states:
                temp_nodes = list(nodes)
                temp_nodes.append(state)
                gen_seq_recur(states, temp_nodes, depth - 1)
    gen_seq_recur(states, [], depth)
    return all_sequences

def score_sequences(sequences, initial_probs, transition_probs, emission_probs, obs):
    # Score each candidate tag sequence as a product of initial/transition and emission probabilities
    best_score = -1
    best_sequence = None
    sequence_scores = []
    for seq in sequences:
        total_score = 1
        first = True
        for i in range(len(seq)):
            state_score = 1
            if first:
                state_score *= initial_probs[seq[i]]
                first = False
            else:
                state_score *= transition_probs[seq[i] + "|" + seq[i-1]]
            state_score *= emission_probs[obs[i] + "|" + seq[i]]
            total_score *= state_score
        sequence_scores.append(total_score)
    return sequence_scores

def pretty_print_probs(distribs):
    print(distribs)
    rows = set()
    cols = set()
    for val in distribs.keys():
        temp = val.split("|")
        rows.add(temp[0])
        cols.add(temp[1])
    rows = list(rows)
    cols = list(cols)
    df = []
    for i in range(len(rows)):
        temp = []
        for j in range(len(cols)):
            temp.append(distribs[rows[i] + "|" + cols[j]])
        df.append(temp)
    l = pd.Index(rows, name="rows")
    C = pd.Index(cols, name="cols")
    df = pd.DataFrame(data=df, index=l, columns=C)
    print(tabulate(df, headers="keys", tablefmt="psql"))

def initializeSequences(_obs):
    # Generate list of candidate tag sequences
    seqlen = len(_obs)
    seqs = generate_sequence(states, seqlen)
    # Score sequences
    seq_scores = score_sequences(seqs, initial_probs, transition_probs,
                                 emission_probs, _obs)
    return (seqlen, seqs, seq_scores)

states = ["Noun", "Verb", "Determiner"]
initial_probs = {"Noun": 0.9, "Verb": 0.05, "Determiner": 0.05}
transition_probs = {"Noun|Noun": 0.2, "Noun|Verb": 0.1, "Noun|Determiner": 0.8,
                    "Verb|Noun": 0.8, "Verb|Verb": 0.1, "Verb|Determiner": 0.1,
                    "Determiner|Noun": 0.1, "Determiner|Verb": 0.8,
                    "Determiner|Determiner": 0.1}
emission_probs = {"Vimal|Noun": 0.9, "taught|Noun": 0.05, "the|Noun": 0.05,
                  "class|Noun": 0.9, "Vimal|Verb": 0.05, "taught|Verb": 0.9,
                  "the|Verb": 0.05, "class|Verb": 0.05, "Vimal|Determiner": 0.05,
                  "taught|Determiner": 0.05, "the|Determiner": 0.9,
                  "class|Determiner": 0.05}
print("Initial Distributions")
print(initial_probs)

print("Transition Probabilities")
pretty_print_probs(transition_probs)

print("Emission Probabilities")
pretty_print_probs(emission_probs)

obs = ["Vimal","taught","the",'class']
print("Scores")
sequence_length, sequences, sequence_scores = initializeSequences(obs)
for i in range(len(sequences)):
    print("Sequence:%-60s Score:%0.6f" % (sequences[i], sequence_scores[i]))
print("Best Sequence")
print(sequences[sequence_scores.index(max(sequence_scores))], max(sequence_scores))
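
The heading also asks for a neural model, which the listing above does not cover. A minimal sketch using spaCy's pretrained pipeline, whose tagger is a neural model (en_core_web_sm is assumed to be installed, as in Practicals 6 and 7):

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Vimal taught the class")
for token in doc:
    # token.pos_ is the coarse tag, token.tag_ the fine-grained Penn Treebank tag
    print(f"{token.text:<10} {token.pos_:<6} {token.tag_}")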

******************Practical – 5 Write a program to implement syntactic parsing of a given text.**************

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag, word_tokenize, RegexpParser

sample_text = ("Usain Bolt is the only sprinter to win Olympic 100 m and 200 m "
               "titles at three consecutive Olympics (2008, 2012, and 2016).")
tagged = pos_tag(word_tokenize(sample_text))
print(tagged)

grammar = r"""
NP: {<DT>?<JJ>*<NN>}   # To extract Noun Phrases
P: {<IN>}              # To extract Prepositions
V: {<V.*>}             # To extract Verbs
PP: {<P> <NP>}         # To extract Prepositional Phrases
VP: {<V> <NP|PP>*}     # To extract Verb Phrases
"""
chunker = RegexpParser(grammar)
output = chunker.parse(tagged)
print("After Extracting\n", output)
output.draw()
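
Besides drawing the tree, the chunk output can be traversed programmatically; a small sketch (assuming the output tree from above) that lists just the noun-phrase chunks:

for subtree in output.subtrees(filter=lambda t: t.label() == "NP"):
    # each NP subtree's leaves are (word, tag) pairs
    print(" ".join(word for word, tag in subtree.leaves()))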
*********************Practical – 6 Write a program to implement dependency parsing of a text.************

import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
sentence = "Usain Bolt is the fastest sprinter in the world."
doc = nlp(sentence)

print("{:<15} | {:<8} | {:<15} | {:<20}".format('Token', 'Relation', 'Head', 'Children'))
print("-" * 70)
for token in doc:
    print("{:<15} | {:<8} | {:<15} | {:<20}".format(
        str(token.text), str(token.dep_), str(token.head.text),
        str([child for child in token.children])))

displacy.render(doc, style='dep', jupyter=True, options={'distance': 120})
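
displacy.render with jupyter=True only displays inline inside a notebook; outside one, the returned markup can be written to a file instead. A small sketch (the output file name is arbitrary):

html = displacy.render(doc, style='dep', page=True, options={'distance': 120})
with open("dependency_parse.html", "w", encoding="utf-8") as f:
    f.write(html)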

********************************* Practical – 7 Write a program to implement Named Entity Recognition (NER).************

import spacy
nlp = spacy.load("en_core_web_sm")
sentence = "World's Fastest sprinter Usain Bolt has the record for 100m in 9.58s"
doc = nlp(sentence)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
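
To make the entity labels readable, spacy.explain can map each label to a short description; a small follow-up sketch using the doc from above:

for ent in doc.ents:
    # spacy.explain returns a human-readable description of the label, e.g. PERSON
    print(ent.text, ent.label_, spacy.explain(ent.label_))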

****************************Practical – 8 Write a program to implement Text Summarization for a given text.***********

import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
nltk.download('stopwords')
def read_article(file_name):
    file = open(file_name, "r")
    filedata = file.readlines()
    article = filedata[0].split(". ")
    sentences = []
    for sentence in article:
        print(sentence)
        # note: str.replace does not interpret regex, so this only strips the literal pattern
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop()
    return sentences
def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2:
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(
                sentences[idx1], sentences[idx2], stop_words)
    return similarity_matrix
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)
def generate_summary(file_name, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []
    sentences = read_article(file_name)
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)),
                             reverse=True)
    print("Top ranked sentences in order are", ranked_sentence)
    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))
    print("Summarized Text:\n", " . ".join(summarize_text))

generate_summary('Usain.txt', 2)
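
To see what sentence_similarity computes, a tiny worked check of the bag-of-words cosine used above (the two token lists are made up for illustration):

s1 = ["usain", "bolt", "is", "fast"]
s2 = ["usain", "bolt", "runs", "fast"]
# 3 shared words, each vector has norm 2, so cosine similarity = 3 / (2 * 2) ≈ 0.75
print(sentence_similarity(s1, s2))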
