# [r4657] trunk/py4science/examples/scrape_key_stats.py
"""
Use urllib to download key statistics for several stock tickers; URL
example pattern
http://finance.yahoo.com/q/ks?s=INTC
"""
import datetime, time
import urllib
import os
import BeautifulSoup

def get_datadir():
    """
    return the data dir used by this program.  It should be
    data/key_stats.  Use the os module to check for the directory's
    existence and create it if necessary.  See os.path.exists,
    os.path.join and os.mkdir
    """
    if not os.path.exists('data'):
        os.mkdir('data')
    datadir = os.path.join('data', 'key_stats')
    if not os.path.exists(datadir):
        os.mkdir(datadir)
    return datadir
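
# For reference: the directories are created relative to the current
# working directory, so on a POSIX system get_datadir() returns
# 'data/key_stats' (creating 'data' and 'data/key_stats' on first use).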

def grab_data(tickers):
    """
    download the html file for each ticker in the list of tickers and
    put the output into datadir with a filename like
    'GOOG_key_stats.html'.  Look at urllib.urlretrieve for fetching
    the file over the web, and insert a time.sleep command in between
    each file grab to lower the burden on the server.

    Return value is a list of (ticker, pathname) pairs where pathname
    is the path to the html file.
    """
    datadir = get_datadir()
    datafiles = []
    for ticker in tickers:
        # make an output filename by creating a file like
        # "GOOG_key_stats.html" in the datadir
        fname = os.path.join(datadir, ticker + '_key_stats.html')

        # only download the file if it doesn't already exist.  Since these
        # stats can change daily, in real life we might want to put a date
        # stamp on the files too
        if not os.path.exists(fname):
            # build the URL from the format pattern and ticker and grab it
            # with urllib
            url = 'http://finance.yahoo.com/q/ks?s=' + ticker
            urllib.urlretrieve(url, fname)
            print 'fetched %s into %s' % (ticker, fname)

            # use time.sleep between grabs to be gentle on the server
            time.sleep(0.5)
        else:
            print 'already have', fname
        datafiles.append((ticker, fname))
    return datafiles
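
# An illustrative call (file names follow the pattern above; the path
# separator depends on the OS, and the download only happens if the
# file is not already cached locally):
#
#     grab_data(['GOOG'])  ->  [('GOOG', 'data/key_stats/GOOG_key_stats.html')]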

def convert(x):
    """
    The table data is all strings and we want to convert it to python
    datatypes as intelligently as possible.  For serious use, you would
    want converters that depend on the column header, but here we'll
    just inspect the string and try to do something semi-intelligent.
    Eg, if it ends with '%', strip the '%' and return a float.  If it
    ends with 'M', strip the 'M' and multiply by 1e6 (likewise for 'K'
    and 'B').  Try to convert things that look like dates to
    datetime.date objects using time.strptime and datetime.date.  Try
    to convert to float using a try/except block.  If everything
    fails, just return the string.
    """
    if x=='N/A': return None
    elif x.endswith('%'): return float(x[:-1])
    elif x.endswith('B'): return float(x[:-1])*1e9
    elif x.endswith('M'): return float(x[:-1])*1e6
    elif x.endswith('K'): return float(x[:-1])*1e3
    else:
        # try to convert to float
        try: return float(x)
        except ValueError: pass

        # try to convert to a date
        try: y, m, d = time.strptime(x, '%d-%b-%y')[:3]
        except ValueError: pass
        else: return datetime.date(y, m, d)

    return x
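
# A few illustrative conversions, following the rules described in the
# docstring (dates assume an English locale for the '%b' month names):
#
#     convert('N/A')        ->  None
#     convert('12.5%')      ->  12.5
#     convert('1.2B')       ->  1200000000.0
#     convert('950.3M')     ->  950300000.0
#     convert('14-Apr-08')  ->  datetime.date(2008, 4, 14)
#     convert('hello')      ->  'hello'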

def parse_htmlfile(fname):
    """
    parse the key statistics html in fname and return a data
    dictionary.  The keys are the headers, and the values are the
    converted data items
    """

    # beautiful soup lets you filter html tags by their properties.  I
    # took a peek at one of the html sources and found the tags that
    # correspond to the tables, headers and data items we are interested
    # in.  Yahoo was nice enough to put "class" information in the tags,
    # which makes this particularly easy.  We'll use the table props, the
    # header props and the data props to select out just the tables and
    # table elements we want
    tableprops = {'class': 'yfnc_datamodoutline1'}
    headerprops = {'class': 'yfnc_tablehead1'}
    dataprops = {'class': 'yfnc_tabledata1'}

    # create the beautiful soup instance with the html string
    soup = BeautifulSoup.BeautifulSoup(file(fname).read())

    datad = dict()
    for table in soup('table', **tableprops):    # get all the data tables
        for row in table('tr'):                  # iterate over all rows
            header = row('td', **headerprops)    # look for td elements with the header class
            data = row('td', **dataprops)        # look for td elements with the data class
            if len(header)==1 and len(data)==1:  # there should be exactly one header and one data item
                # convert the header to a string, strip whitespace and
                # drop the trailing colon
                header = str(header[0].contents[0]).strip()
                if header.endswith(':'): header = header[:-1]
                data = str(data[0].contents[0]).strip()
                datad[header] = convert(data)    # call our all-powerful convert function
    return datad
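
# The returned dictionary maps the row headers of Yahoo's key statistics
# tables to converted values; a hypothetical sketch (the actual keys
# depend on Yahoo's page layout at the time of the grab):
#
#     {'Forward P/E': 15.2, 'Market Cap (intraday)': 131100000000.0, ...}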

# a list of stock tickers to download and parse
tickers = 'INTC', 'MSFT', 'YHOO', 'GOOG', 'GE', 'WMT', 'CROX'

# we'll store the results in a dictionary of dictionaries.  tickerd is
# keyed off the ticker and points to the data dictionary returned by
# parse_htmlfile
tickerd = dict()
for ticker, datafile in grab_data(tickers):
    tickerd[ticker] = parse_htmlfile(datafile)

# now let's pretty print the data for one ticker
ticker = 'INTC'
maxlen = max([len(header) for header in tickerd[ticker]])
for header, data in tickerd[ticker].items():
    print '%s: %s' % (header.ljust(maxlen), data)