0% found this document useful (0 votes)
21 views5 pages

Data Science 1: Assignment No. 2 Date: Sept 26, 2016

This document is an assignment for a data science course. It contains Python code to analyze a text document from Project Gutenberg. The code downloads the text, removes HTML tags and punctuation, stems the words, removes common stopwords, counts word frequencies, and plots the results in a histogram. The code also times how long the analysis takes to run.

Uploaded by

Ashish
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
21 views5 pages

Data Science 1: Assignment No. 2 Date: Sept 26, 2016

This document is an assignment for a data science course. It contains Python code to analyze a text document from Project Gutenberg. The code downloads the text, removes HTML tags and punctuation, stems the words, removes common stopwords, counts word frequencies, and plots the results in a histogram. The code also times how long the analysis takes to run.

Uploaded by

Ashish
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 5

Data Science 1: Assignment No. 2
Date: Sept 26, 2016

By,
Ashish Menkudale
UIN: 656130575
[email protected]

import timeit
import urllib.request  # Python 3 replacement for the Python 2 urllib2 module
import numpy as np
import pandas as pd
import bs4
import requests
from bs4 import BeautifulSoup

start = timeit.default_timer()
# timer started

# Download the full text of "A Tale of Two Cities" from archive.org.
# NOTE(review): the original URL was mangled by a link proxy
# ("https://..."); restored to the plain archive.org address.
url = "https://archive.org/stream/ataleoftwocities00098gut/98.txt"
with urllib.request.urlopen(url) as data:
    # Decode each line once and join once at the end: the original re-joined
    # the whole accumulated list on every iteration, which is quadratic in
    # the number of lines.
    l = [line.decode("utf-8", errors="replace") for line in data]
# NOTE(review): this name shadows the builtin `str`; it is kept only because
# later sections of this script read it. A rename is the better long-term fix.
str = '\n'.join(l)

print(str)
# got all the text here

import lxml.html
# NOTE(review): `htmlentitydefs` (Python 2 only) was imported here but never
# used; in Python 3 the equivalent is `html.entities`. Dropped as dead code.
import re

# Strip anything that looks like an HTML/XML tag (non-greedy match between
# angle brackets).
filtered_str = re.sub('<[^<]+?>', '', str)
print(filtered_str)
# cleared html tags

# Remove punctuation: keep only word characters and whitespace.
removed_punct = re.sub(r'[^\w\s]', '', filtered_str)
print(removed_punct)
# removed punctuation over here
# Common words (and bare digits) to drop before counting frequencies.
# Changed from a list to a set: membership tests are O(1) instead of O(n)
# per word, and the duplicates in the original ('his', 'she') disappear.
stopwords = {
    'had', 'has', 'your', 'you', 'with', 'i', 'his', 'she', 'he', 'are',
    'not', 'the', 'a', 'was', 'an', 'and', 'of', 'at', 'on', 'over', 'under',
    'to', 'from', 'what', 'if', 'else', 'also', 'in', 'is', 'it', 'by',
    'this', 'that', 'have', 'be', 'as', 'were', 'for', 'so', 'him', 'her',
    'but', 'or', 'no', 'will', 'my', 'up', 'its', 'there', 'away', 'me',
    'we', 'they', 'only', 'too', 'down', 'upon', 'into', 'their', 'here',
    'could', 'would', 'been', 'after', 'us',
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '0',
}
querywords = removed_punct.split()
# Case-insensitive filter; original casing of the kept words is preserved.
resultwords = [word for word in querywords if word.lower() not in stopwords]
result = ' '.join(resultwords)
print(result)
# removed the common occurrences

from collections import Counter

# Count word frequencies. The original built the dict with Python 2's
# builtin `reduce` (moved to functools in Python 3), shadowed the builtin
# name `list`, and relied on py2's list-returning dict.items() plus the
# in-place list.sort() — all replaced with Counter + sorted().
word_freq = Counter(result.split())

# Ascending by frequency, matching the original sort key.
sorted_list = sorted(word_freq.items(), key=lambda item: item[1])
for word in sorted_list:
    print(word)
# got the frequency and sorted it over here

# Split the filtered text into a flat list of words (any non-word char
# acts as a separator).
wordList = re.sub(r"[^\w]", " ", result).split()

# BUG FIX: the original printed `wordlist` (lowercase L), which is an
# undefined name and raises NameError at runtime.
print(wordList)
# changed the datatype over here

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

# Tally how often each remaining (stopword-filtered) word occurs.
word_counts = Counter(wordList)
def plot_bar_from_counter(counter, ax=None):
    """Draw a bar chart of the frequencies stored in *counter*.

    Parameters
    ----------
    counter : collections.Counter
        Mapping of label -> count to plot, one bar per entry.
    ax : matplotlib axes, optional
        Axes to draw on; when omitted a new figure and axes are created.

    Returns
    -------
    The axes the bars were drawn on.
    """
    if ax is None:
        fig = plt.figure()
        ax = fig.add_subplot(111)
    # Materialize the dict views: in Python 3 .values()/.keys() are lazy
    # views, and plt.FixedFormatter expects an indexable sequence.
    frequencies = list(counter.values())
    names = list(counter.keys())
    x_coordinates = np.arange(len(counter))
    ax.bar(x_coordinates, frequencies, align='center')
    # Pin one tick (and one label) under each bar.
    ax.xaxis.set_major_locator(plt.FixedLocator(x_coordinates))
    ax.xaxis.set_major_formatter(plt.FixedFormatter(names))
    return ax

# Render the word-frequency bar chart and block until the window is closed.
plot_bar_from_counter(word_counts)
plt.show()
# plotted histogram

# Report total wall-clock time for the whole analysis (Python 3 print
# function replaces the Python 2 print statement).
print(timeit.default_timer() - start)
# got the time

6.86172139321

You might also like