forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathframe_ctor.py
100 lines (82 loc) · 3.03 KB
/
frame_ctor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from vbench.benchmark import Benchmark
from datetime import datetime
# Bind the date-offset module as ``offsets``.  pandas.tseries.offsets is tried
# first; if that module cannot be imported, pandas.core.datetools is used
# instead (presumably the location in older pandas layouts -- confirm against
# the pandas versions being benchmarked).  Only ImportError triggers the
# fallback so that unrelated failures are not silently masked.
try:
    import pandas.tseries.offsets as offsets
except ImportError:
    import pandas.core.datetools as offsets
# Source-code prefix shared by every benchmark's setup string in this file.
# It star-imports the common benchmark helpers and the date offsets, using the
# same two-location import fallback as the module-level ``offsets`` import.
# The text is executed as Python by the benchmark runner, so the indentation
# inside the string is significant.
common_setup = """from pandas_vb_common import *
try:
    from pandas.tseries.offsets import *
except ImportError:
    from pandas.core.datetools import *
"""
#----------------------------------------------------------------------
# Creation from nested dict

# Setup shared by the three constructor benchmarks below.  It builds an
# N x K frame of random floats and derives three inputs from it:
#   data      -- the frame converted with to_dict() (toDict() as a fallback
#                when the snake_case method does not exist)
#   some_dict -- the first value of ``data``
#   dict_list -- one {column: value} dict per row of the frame
setup = common_setup + """
N, K = 5000, 50
index = tm.makeStringIndex(N)
columns = tm.makeStringIndex(K)
frame = DataFrame(np.random.randn(N, K), index=index, columns=columns)

try:
    data = frame.to_dict()
except AttributeError:
    data = frame.toDict()

some_dict = list(data.values())[0]
dict_list = [dict(zip(columns, row)) for row in frame.values]
"""

frame_ctor_nested_dict = Benchmark("DataFrame(data)", setup)

# From JSON-like stuff
frame_ctor_list_of_dict = Benchmark("DataFrame(dict_list)", setup,
                                    start_date=datetime(2011, 12, 20))

series_ctor_from_dict = Benchmark("Series(some_dict)", setup)
# nested dict, integer indexes, regression described in #621
#
# ``data`` maps each of 2000 integer outer keys to an inner dict of 100
# integer keys with float values.  ``range`` is used instead of ``xrange`` so
# the setup code runs on both Python 2 and Python 3; the resulting dict is
# the same.
setup = common_setup + """
data = dict((i,dict((j,float(j)) for j in range(100))) for i in range(2000))
"""

frame_ctor_nested_dict_int64 = Benchmark("DataFrame(data)", setup)
# dynamically generate benchmarks for every offset
#
# get_period_count & get_index_for_offset are there because blindly taking each
# offset times 1000 can easily go out of Timestamp bounds and raise errors.
dynamic_benchmarks = {}
n_steps = [1, 2]

# Setup-code template for one (offset, n) pair.  The two "{}" slots on the
# ``idx = ...`` line are filled with the offset name taken from
# ``offsets.__all__`` and the step count ``n``.  The text is executed as
# Python by the benchmark runner, so its indentation is significant.  It must
# not contain any other braces, since it is passed through str.format().
_dtindex_setup_template = """
def get_period_count(start_date, off):
    ten_offsets_in_days = ((start_date + off * 10) - start_date).days
    if ten_offsets_in_days == 0:
        return 1000
    else:
        return min(9 * ((Timestamp.max - start_date).days //
                        ten_offsets_in_days),
                   1000)

def get_index_for_offset(off):
    start_date = Timestamp('1/1/1900')
    return date_range(start_date,
                      periods=min(1000, get_period_count(start_date, off)),
                      freq=off)

idx = get_index_for_offset({}({}))
df = DataFrame(np.random.randn(len(idx),10), index=idx)
d = dict([ (col,df[col]) for col in df.columns ])
"""

# One benchmark per offset name and per step count, keyed (and named) as
# 'frame_ctor_dtindex_<offset>x<n>'.
for offset in offsets.__all__:
    for n in n_steps:
        setup = common_setup + _dtindex_setup_template.format(offset, n)
        key = 'frame_ctor_dtindex_{}x{}'.format(offset, n)
        dynamic_benchmarks[key] = Benchmark("DataFrame(d)", setup, name=key)

# Have to stuff them in globals() so vbench detects them
globals().update(dynamic_benchmarks)
# DataFrame built from a Series whose index is a MultiIndex
# (100 x 100 integer tuples, 10000 random values).
setup = (
    common_setup
    + "\n"
    + "mi = MultiIndex.from_tuples([(x,y) for x in range(100) for y in range(100)])\n"
    + "s = Series(randn(10000), index=mi)\n"
)

frame_from_series = Benchmark("DataFrame(s)", setup)
#----------------------------------------------------------------------
# get_numeric_data

# Mixed-dtype frame: 25 columns from randn plus two constant string columns,
# consolidated before the timed call to df._get_numeric_data().
setup = (
    common_setup
    + "\n"
    + "df = DataFrame(randn(10000, 25))\n"
    + "df['foo'] = 'bar'\n"
    + "df['bar'] = 'baz'\n"
    + "df = df.consolidate()\n"
)

frame_get_numeric_data = Benchmark(
    "df._get_numeric_data()",
    setup,
    start_date=datetime(2011, 11, 1),
)