forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjoin_merge.py
96 lines (77 loc) · 3.12 KB
/
join_merge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from vbench.benchmark import Benchmark
from datetime import datetime
# Preamble prepended to every benchmark's setup code.  The setup snippets in
# this file use names such as np, rands, random, DataFrame and MultiIndex
# without importing them, so they presumably all come from this star import
# (NOTE(review): confirm against pandas_vb_common).
common_setup = """from pandas_vb_common import *
"""

# Setup code for the "join on key" benchmarks.  It is kept as a string because
# it is handed to Benchmark(...) as source text rather than executed here.
#
# What the snippet builds:
#   level1 / level2  - 10 and 1000 random 10-character keys (object dtype)
#   label1 / label2  - 10000 integer positions each, pairing every level1 key
#                      with every level2 key
#   key1 / key2      - the labels mapped to keys and tiled to 100000 rows
#   df               - 100000-row frame with two random columns plus key1/key2
#   df_key1/df_key2  - 10-row and 1000-row frames indexed by level1 / level2
#   df_multi         - 10000-row frame indexed by the (level1, level2)
#                      MultiIndex, when MultiIndex is available
#
# The bodies of the try/except suites and the continuation lines MUST stay
# indented: this text is compiled as Python, and a flat `try:` suite is an
# IndentationError that would break every benchmark using this setup.
setup = common_setup + """
level1 = np.array([rands(10) for _ in xrange(10)], dtype='O')
level2 = np.array([rands(10) for _ in xrange(1000)], dtype='O')
label1 = np.arange(10).repeat(1000)
label2 = np.tile(np.arange(1000), 10)
key1 = np.tile(level1.take(label1), 10)
key2 = np.tile(level2.take(label2), 10)
shuf = np.arange(100000)
random.shuffle(shuf)
try:
    index2 = MultiIndex(levels=[level1, level2], labels=[label1, label2])
    index3 = MultiIndex(levels=[np.arange(10), np.arange(100), np.arange(100)],
                        labels=[np.arange(10).repeat(10000),
                                np.tile(np.arange(100).repeat(100), 10),
                                np.tile(np.tile(np.arange(100), 100), 10)])
    df_multi = DataFrame(np.random.randn(len(index2), 4), index=index2,
                         columns=['A', 'B', 'C', 'D'])
except Exception:  # pre-MultiIndex
    pass
try:
    DataFrame = DataMatrix
except NameError:
    pass
df = DataFrame({'data1' : np.random.randn(100000),
                'data2' : np.random.randn(100000),
                'key1' : key1,
                'key2' : key2})
df_key1 = DataFrame(np.random.randn(len(level1), 4), index=level1,
                    columns=['A', 'B', 'C', 'D'])
df_key2 = DataFrame(np.random.randn(len(level2), 4), index=level2,
                    columns=['A', 'B', 'C', 'D'])
"""
#----------------------------------------------------------------------
# DataFrame joins on key
# Join the 100000-row frame `df` against the 10-row frame `df_key1`,
# matching df's 'key1' column to df_key1's index.
join_dataframe_index_single_key_small = Benchmark(
    "df.join(df_key1, on='key1')",
    setup,
    name='join_dataframe_index_single_key_small')

# Same join shape, but against the 1000-row frame `df_key2` via 'key2'.
join_dataframe_index_single_key_bigger = Benchmark(
    "df.join(df_key2, on='key2')",
    setup,
    name='join_dataframe_index_single_key_bigger')

# Two-column join against `df_multi`, whose index is a two-level MultiIndex.
# start_date is passed through to Benchmark; presumably it marks the earliest
# revision date this benchmark applies to (NOTE(review): confirm in vbench).
join_dataframe_index_multi = Benchmark(
    "df.join(df_multi, on=['key1', 'key2'])",
    setup,
    name='join_dataframe_index_multi',
    start_date=datetime(2011, 10, 20))
#----------------------------------------------------------------------
# DataFrame joins on index
#----------------------------------------------------------------------
# Merges
#----------------------------------------------------------------------
# data alignment
# Setup code for the series-alignment benchmarks (replaces the join setup
# bound to `setup` earlier in the file).  Kept as a string because it is
# handed to Benchmark(...) as source text rather than executed here.
#
# What the snippet builds:
#   stamps     - 1,000,000 int64 values: the current time viewed as 'i8'
#                plus offsets 0, 1e7, 2e7, ... (below 1e13)
#   sample()   - picks k distinct elements via a random permutation
#   idx1/idx2  - two independently drawn, sorted samples of 500000 stamps
#   ts1/ts2    - Series of 500000 random values indexed by idx1 / idx2
#
# The body of `sample` MUST stay indented: this text is compiled as Python,
# and a `def` with a flat body is an IndentationError.
setup = common_setup + """n = 1000000
# indices = Index([rands(10) for _ in xrange(n)])
def sample(values, k):
    sampler = np.random.permutation(len(values))
    return values.take(sampler[:k])
sz = 500000
rng = np.arange(0, 10000000000000, 10000000)
stamps = np.datetime64(datetime.now()).view('i8') + rng
idx1 = np.sort(sample(stamps, sz))
idx2 = np.sort(sample(stamps, sz))
ts1 = Series(np.random.randn(sz), idx1)
ts2 = Series(np.random.randn(sz), idx2)
"""
# Time the addition of two Series whose integer indexes only partly overlap.
stmt = "ts1 + ts2"
series_align_int64_index = Benchmark(
    stmt,
    setup,
    name="series_align_int64_index",
    start_date=datetime(2010, 6, 1),
    logy=True)

# Time an explicit left-join alignment of the same two Series; both indexes
# were sorted in the setup code.
stmt = "ts1.align(ts2, join='left')"
series_align_left_monotonic = Benchmark(
    stmt,
    setup,
    name="series_align_left_monotonic",
    start_date=datetime(2011, 3, 1),
    logy=True)