*********************Practical – 1 Write a program to implement tokenization.************
import nltk
nltk.download('punkt')
doc= "An eight-time Olympic gold medalist, Bolt is the only sprinter to win Olympic
100 m and 200 m titles at three consecutive Olympics (2008, 2012, and 2016). He
also won two 4 × 100 relay gold medals."
tokens= nltk.word_tokenize(doc)
print(tokens)
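The same document can also be split at sentence level; a small sketch using NLTK's sent_tokenize (an addition, not part of the original listing):
from nltk.tokenize import sent_tokenize
# Split the same document into sentences rather than words.
sentences = sent_tokenize(doc)
print(sentences)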
*********************Practical – 2 Write a program to implement stemming and lemmatization.************
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
ps = PorterStemmer()
words= ["program", "programs", "programmer", "programming", "programmers"]
for w in words:
print(w, ":" , ps.stem(w))
lemmatizer = WordNetLemmatizer()
print("rocks:", lemmatizer.lemmatize("rocks"))
*********************Practical – 3 Write a program to implement an n-gram model and text generation.************
import nltk
import markovify
import os
from collections import Counter
nltk.download('punkt')
content = []
for filename in os.listdir("trigram/"):
    with open(f'trigram/{filename}') as f:
        content.append(f.read())
corpus = []
for item in content:
    corpus.extend(
        [word.lower() for word in nltk.word_tokenize(item) if any(c.isalpha() for c in word)])
ngrams = Counter(nltk.ngrams(corpus, 3))
for ngram, freq in ngrams.most_common(10):
    print(f'{freq}:{ngram}')
with open('Usain.txt') as f:
    text = f.read()
model = markovify.Text(text)
sent = model.make_sentence()
print(sent)
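make_sentence() returns None when markovify cannot build a sentence that differs enough from the corpus; a small defensive sketch (an addition, not part of the original listing):
# Retry more times and fall back to a short sentence if generation fails.
sent = model.make_sentence(tries=100)
if sent is None:
    sent = model.make_short_sentence(140)
print(sent)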
*********************Practical – 4 Write a program to implement part-of-speech tagging using a Hidden Markov Model.************
import pandas as pd
from tabulate import tabulate

def pretty_print_probs(distribs):
    print(distribs)
    rows = set()
    cols = set()
    for val in distribs.keys():
        temp = val.split("|")
        rows.add(temp[0])
        cols.add(temp[1])
    rows = list(rows)
    cols = list(cols)
    df = []
    for i in range(len(rows)):
        temp = []
        for j in range(len(cols)):
            temp.append(distribs[rows[i] + "|" + cols[j]])
        df.append(temp)
    l = pd.Index(rows, name="rows")
    C = pd.Index(cols, name="cols")
    df = pd.DataFrame(data=df, index=l, columns=C)
    print(tabulate(df, headers="keys", tablefmt="psql"))
def initializeSequences(_obs):
    # Generate the list of candidate tag sequences
    seqlen = len(_obs)
    seqs = generate_sequence(states, seqlen)
    # Score each candidate sequence against the observations
    seq_scores = score_sequences(seqs, initial_probs, transition_probs,
                                 emission_probs, _obs)
    return (seqlen, seqs, seq_scores)
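# NOTE: generate_sequence and score_sequences are not defined in the original
# listing; the following is a minimal brute-force sketch consistent with how
# initializeSequences calls them and with the "next|previous" / "word|tag"
# key format of the probability tables below.
import itertools

def generate_sequence(_states, _length):
    # Enumerate every possible tag sequence of the given length.
    return [list(seq) for seq in itertools.product(_states, repeat=_length)]

def score_sequences(_seqs, _initial, _transitions, _emissions, _obs):
    # Score = P(tag1) * P(word1|tag1) * prod_i P(tag_i|tag_{i-1}) * P(word_i|tag_i)
    scores = []
    for seq in _seqs:
        score = _initial[seq[0]] * _emissions[_obs[0] + "|" + seq[0]]
        for i in range(1, len(seq)):
            score *= _transitions[seq[i] + "|" + seq[i - 1]]
            score *= _emissions[_obs[i] + "|" + seq[i]]
        scores.append(score)
    return scores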
states = ["Noun","Verb","Determiner"]
initial_probs = {"Noun":0.9, "Verb":0.05, "Determiner":0.05}
transition_probs = {"Noun|Noun":0.2, "Noun|Verb": 0.1, "Noun|Determiner":0.8,
"Verb|Noun":0.8,"Verb|Verb":0.1, "Verb|Determiner":0.1, "Determiner|Noun":0.1,
"Determiner|Verb":0.8, "Determiner|Determiner":0.1}
emission_probs = {"Vimal|Noun":0.9,"taught|Noun":0.05, "the|Noun": 0.05,"class|
Noun":0.9, "Vimal|Verb":0.05, "taught|Verb":0.9, "the|Verb":0.05, "class|
Verb":0.05, "Vimal|Determiner":0.05, "taught|Determiner":0.05, "the|
Determiner":0.9, "class|Determiner":0.05}
print("Initial Distributions")
print(initial_probs)
print("Transition Probabilities")
pretty_print_probs(transition_probs)
print("Emission Probabilities")
pretty_print_probs(emission_probs)
obs = ["Vimal","taught","the",'class']
print("Scores")
sequence_length, sequences, sequence_scores = initializeSequences(obs)
for i in range(len(sequences)):
print("Sequence:%-60s Score:%0.6f" % (sequences[i],sequence_scores[i]))
print("Best Sequence")
print(sequences[sequence_scores.index(max(sequence_scores))], max(sequence_scores))
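Brute-force scoring enumerates |states|^n sequences (81 here); a Viterbi-style dynamic-programming sketch over the same probability tables (an addition, not part of the original listing):
def viterbi(_obs, _states, _initial, _transitions, _emissions):
    # best[t][s] = (probability of the best path ending in state s at step t, backpointer)
    best = [{s: (_initial[s] * _emissions[_obs[0] + "|" + s], None) for s in _states}]
    for t in range(1, len(_obs)):
        layer = {}
        for s in _states:
            prob, prev = max(
                (best[t - 1][p][0] * _transitions[s + "|" + p] * _emissions[_obs[t] + "|" + s], p)
                for p in _states)
            layer[s] = (prob, prev)
        best.append(layer)
    # Backtrack from the most probable final state.
    state, (prob, _) = max(best[-1].items(), key=lambda kv: kv[1][0])
    path = [state]
    for t in range(len(_obs) - 1, 0, -1):
        state = best[t][state][1]
        path.append(state)
    return list(reversed(path)), prob

print(viterbi(obs, states, initial_probs, transition_probs, emission_probs))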
*********************Practical – 5 Write a program to implement chunking of a text.************
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag, word_tokenize, RegexpParser
sample_text = "Usain Bolt is the only sprinter to win Olympic 100 m and 200 m
titles at three consecutive Olympics (2008, 2012, and 2016)."
tagged = pos_tag(word_tokenize(sample_text))
print(tagged)
grammar= r"""
NP: {<DT>?<JJ>*<NN>} #To e xtract Noun Phroscs
P: {<IN>} #To extract Verbs
PP: {<p> <NP>} #To extract Prc1>0s1t1onal Phrases
VP: {<V> <NP|PP>*} #To extract Verb Phrases
"""
chunker = RegexpParser(grammar)
output = chunker.parse(tagged)
print(" After Extracting\n", output)
output.draw()
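To work with the chunks programmatically rather than through the drawing window, the subtrees of the parse can be filtered by label; a small sketch (an addition, not part of the original listing):
# Print just the noun-phrase chunks as plain strings.
for subtree in output.subtrees(filter=lambda t: t.label() == 'NP'):
    print(" ".join(word for word, tag in subtree.leaves()))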
*********************Practical – 6 Write a program to implement a dependency parsing of a text.************
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
sentence = "Usain Bolt is the fastest sprinter in the world."
doc = nlp(sentence)
print("{:<15} | {:<8} | {:<15} | {:<20}".format('Token', 'Relation', 'Head', 'Children'))
print("-" * 70)
for token in doc:
    print("{:<15} | {:<8} | {:<15} | {:<20}".format(
        str(token.text), str(token.dep_), str(token.head.text),
        str([child for child in token.children])))
displacy.render(doc, style='dep', jupyter=True, options={'distance': 120})
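jupyter=True only renders inside a notebook; when the script is run from the command line, the markup can be saved to a file instead (a sketch, not part of the original listing; the output file name is chosen here):
# Outside Jupyter: get the SVG markup and save it to disk.
svg = displacy.render(doc, style='dep', jupyter=False, options={'distance': 120})
with open('dependency_parse.svg', 'w', encoding='utf-8') as f:  # file name chosen for this example
    f.write(svg)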
*********************Practical – 7 Write a program to implement named entity recognition.************
import spacy
nlp = spacy.load("en_core_web_sm")
sentence = "World's Fastest sprinter Usain Bolt has the record for 100m in 9.58s"
doc = nlp(sentence)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
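The label codes can be expanded into readable descriptions with spacy.explain (an addition, not part of the original listing):
# Show what each entity label means, e.g. PERSON, QUANTITY, TIME.
for ent in doc.ents:
    print(ent.text, "->", ent.label_, "=", spacy.explain(ent.label_))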
*********************Practical – 8 Write a program to implement text summarization.************
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
nltk.download('stopwords')
def read_article(file_name):
    file = open(file_name, "r")
    filedata = file.readlines()
    article = filedata[0].split(". ")
    sentences = []
    for sentence in article:
        print(sentence)
        # str.replace treats the pattern as a literal string, not a regex
        sentences.append(sentence.replace("[^a-zA-Z]", " ").split(" "))
    sentences.pop()
    return sentences
def build_similarity_matrix(sentences, stop_words):
    similarity_matrix = np.zeros((len(sentences), len(sentences)))
    for idx1 in range(len(sentences)):
        for idx2 in range(len(sentences)):
            if idx1 == idx2:
                continue
            similarity_matrix[idx1][idx2] = sentence_similarity(
                sentences[idx1], sentences[idx2], stop_words)
    return similarity_matrix
def sentence_similarity(sent1, sent2, stopwords=None):
    if stopwords is None:
        stopwords = []
    sent1 = [w.lower() for w in sent1]
    sent2 = [w.lower() for w in sent2]
    all_words = list(set(sent1 + sent2))
    vector1 = [0] * len(all_words)
    vector2 = [0] * len(all_words)
    for w in sent1:
        if w in stopwords:
            continue
        vector1[all_words.index(w)] += 1
    for w in sent2:
        if w in stopwords:
            continue
        vector2[all_words.index(w)] += 1
    return 1 - cosine_distance(vector1, vector2)
def generate_summary(file_name, top_n=5):
    stop_words = stopwords.words('english')
    summarize_text = []
    sentences = read_article(file_name)
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentence = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    print("Indexes of top ranked_sentence order are", ranked_sentence)
    for i in range(top_n):
        summarize_text.append(" ".join(ranked_sentence[i][1]))
    print("Summarize Text: \n", " . ".join(summarize_text))

generate_summary('Usain.txt', 2)