from vbench.api import Benchmark
from datetime import datetime

common_setup = """from pandas_vb_common import *
"""

setup = common_setup + """
N = 100000
ngroups = 100

def get_test_data(ngroups=100, n=N):
    unique_groups = range(ngroups)
    arr = np.asarray(np.tile(unique_groups, n // ngroups), dtype=object)

    if len(arr) < n:
        arr = np.asarray(list(arr) + unique_groups[:n - len(arr)],
                         dtype=object)

    random.shuffle(arr)
    return arr

# aggregate multiple columns
df = DataFrame({'key1' : get_test_data(ngroups=ngroups),
                'key2' : get_test_data(ngroups=ngroups),
                'data1' : np.random.randn(N),
                'data2' : np.random.randn(N)})

def f():
    df.groupby(['key1', 'key2']).agg(lambda x: x.values.sum())

simple_series = Series(np.random.randn(N))
key1 = df['key1']
"""

stmt1 = "df.groupby(['key1', 'key2'])['data1'].agg(lambda x: x.values.sum())"
groupby_multi_python = Benchmark(stmt1, setup,
                                 start_date=datetime(2011, 7, 1))

stmt3 = "df.groupby(['key1', 'key2']).sum()"
groupby_multi_cython = Benchmark(stmt3, setup,
                                 start_date=datetime(2011, 7, 1))

stmt = "df.groupby(['key1', 'key2'])['data1'].agg(np.std)"
groupby_multi_series_op = Benchmark(stmt, setup,
                                    start_date=datetime(2011, 8, 1))

groupby_series_simple_cython = \
    Benchmark('simple_series.groupby(key1).sum()', setup,
              start_date=datetime(2011, 3, 1))

#----------------------------------------------------------------------
# 2d grouping, aggregate many columns

setup = common_setup + """
labels = np.random.randint(0, 100, size=1000)
df = DataFrame(randn(1000, 1000))
"""

groupby_frame_cython_many_columns = Benchmark(
    'df.groupby(labels).sum()', setup,
    start_date=datetime(2011, 8, 1),
    logy=True)

#----------------------------------------------------------------------
# single key, long, integer key

setup = common_setup + """
data = np.random.randn(100000, 1)
labels = np.random.randint(0, 1000, size=100000)
df = DataFrame(data)
"""

groupby_frame_singlekey_integer = \
    Benchmark('df.groupby(labels).sum()', setup,
              start_date=datetime(2011, 8, 1), logy=True)

#----------------------------------------------------------------------
# group with different functions per column

setup = common_setup + """
fac1 = np.array(['A', 'B', 'C'], dtype='O')
fac2 = np.array(['one', 'two'], dtype='O')
df = DataFrame({'key1': fac1.take(np.random.randint(0, 3, size=100000)),
                'key2': fac2.take(np.random.randint(0, 2, size=100000)),
                'value1' : np.random.randn(100000),
                'value2' : np.random.randn(100000),
                'value3' : np.random.randn(100000)})
"""

groupby_multi_different_functions = \
    Benchmark("""df.groupby(['key1', 'key2']).agg({'value1' : 'mean',
                                                   'value2' : 'var',
                                                   'value3' : 'sum'})""",
              setup, start_date=datetime(2011, 9, 1))

groupby_multi_different_numpy_functions = \
    Benchmark("""df.groupby(['key1', 'key2']).agg({'value1' : np.mean,
                                                   'value2' : np.var,
                                                   'value3' : np.sum})""",
              setup, start_date=datetime(2011, 9, 1))

#----------------------------------------------------------------------
# size() speed

setup = common_setup + """
df = DataFrame({'key1': np.random.randint(0, 500, size=100000),
                'key2': np.random.randint(0, 100, size=100000),
                'value1' : np.random.randn(100000),
                'value2' : np.random.randn(100000),
                'value3' : np.random.randn(100000)})
"""

groupby_multi_size = Benchmark("df.groupby(['key1', 'key2']).size()",
                               setup, start_date=datetime(2011, 10, 1))

#----------------------------------------------------------------------
# Series.value_counts

setup = common_setup + """
s = Series(np.random.randint(0, 1000, size=100000))
"""

series_value_counts_int64 = Benchmark('s.value_counts()', setup,
                                      start_date=datetime(2011, 10, 21))

# value_counts on lots of strings

setup = common_setup + """
K = 1000
N = 100000
uniques = np.array([rands(10) for x in xrange(K)], dtype='O')
s = Series(np.tile(uniques, N // K))
"""

series_value_counts_strings = Benchmark('s.value_counts()', setup,
                                        start_date=datetime(2011, 10, 21))

#----------------------------------------------------------------------
# pivot_table

setup = common_setup + """
fac1 = np.array(['A', 'B', 'C'], dtype='O')
fac2 = np.array(['one', 'two'], dtype='O')

ind1 = np.random.randint(0, 3, size=100000)
ind2 = np.random.randint(0, 2, size=100000)

df = DataFrame({'key1': fac1.take(ind1),
                'key2': fac2.take(ind2),
                'key3': fac2.take(ind2),
                'value1' : np.random.randn(100000),
                'value2' : np.random.randn(100000),
                'value3' : np.random.randn(100000)})
"""

stmt = "df.pivot_table(rows='key1', cols=['key2', 'key3'])"
groupby_pivot_table = Benchmark(stmt, setup,
                                start_date=datetime(2011, 12, 15))

#----------------------------------------------------------------------
# dict return values

setup = common_setup + """
labels = np.arange(1000).repeat(10)
data = Series(randn(len(labels)))
f = lambda x: {'first': x.values[0], 'last': x.values[-1]}
"""

groupby_apply_dict_return = Benchmark('data.groupby(labels).apply(f)',
                                      setup,
                                      start_date=datetime(2011, 12, 15))

#----------------------------------------------------------------------
# First / last functions

setup = common_setup + """
labels = np.arange(10000).repeat(10)
data = Series(randn(len(labels)))
data[::3] = np.nan
data[1::3] = np.nan
data2 = Series(randn(len(labels)), dtype='float32')
data2[::3] = np.nan
data2[1::3] = np.nan
labels = labels.take(np.random.permutation(len(labels)))
"""

groupby_first = Benchmark('data.groupby(labels).first()', setup,
                          start_date=datetime(2012, 5, 1))
groupby_first_float32 = Benchmark('data2.groupby(labels).first()', setup,
                                  start_date=datetime(2013, 1, 1))

groupby_last = Benchmark('data.groupby(labels).last()', setup,
                         start_date=datetime(2012, 5, 1))
groupby_last_float32 = Benchmark('data2.groupby(labels).last()', setup,
                                 start_date=datetime(2013, 1, 1))

#----------------------------------------------------------------------
# groupby_indices replacement, chop up Series

setup = common_setup + """
try:
    rng = date_range('1/1/2000', '12/31/2005', freq='H')
    year, month, day = rng.year, rng.month, rng.day
except:
    rng = date_range('1/1/2000', '12/31/2000', offset=datetools.Hour())
    year = rng.map(lambda x: x.year)
    month = rng.map(lambda x: x.month)
    day = rng.map(lambda x: x.day)

ts = Series(np.random.randn(len(rng)), index=rng)
"""

groupby_indices = Benchmark('len(ts.groupby([year, month, day]))',
                            setup, start_date=datetime(2012, 1, 1))

#----------------------------------------------------------------------
# median

#----------------------------------------------------------------------
# single key, long, integer key

setup = common_setup + """
data = np.random.randn(100000, 2)
labels = np.random.randint(0, 1000, size=100000)
df = DataFrame(data)
"""

groupby_frame_median = \
    Benchmark('df.groupby(labels).median()', setup,
              start_date=datetime(2011, 8, 1), logy=True)

setup = common_setup + """
data = np.random.randn(1000000, 2)
labels = np.random.randint(0, 1000, size=1000000)
df = DataFrame(data)
"""

groupby_simple_compress_timing = \
    Benchmark('df.groupby(labels).mean()', setup,
              start_date=datetime(2011, 8, 1))

#----------------------------------------------------------------------
# DataFrame Apply overhead

setup = common_setup + """
N = 10000
labels = np.random.randint(0,
                           2000, size=N)
labels2 = np.random.randint(0, 3, size=N)
df = DataFrame({'key': labels,
                'key2': labels2,
                'value1': randn(N),
                'value2': ['foo', 'bar', 'baz', 'qux'] * (N // 4)})

def f(g):
    return 1
"""

groupby_frame_apply_overhead = Benchmark("df.groupby('key').apply(f)", setup,
                                         start_date=datetime(2011, 10, 1))

groupbym_frame_apply = Benchmark("df.groupby(['key', 'key2']).apply(f)",
                                 setup,
                                 start_date=datetime(2011, 10, 1))

#----------------------------------------------------------------------
# Sum booleans #2692

setup = common_setup + """
N = 500
df = DataFrame({'ii': range(N),
                'bb': [True for x in range(N)]})
"""

groupby_sum_booleans = Benchmark("df.groupby('ii').sum()", setup)

#----------------------------------------------------------------------
# Transform testing

setup = common_setup + """
n_dates = 1000
n_securities = 500
n_columns = 3
share_na = 0.1

dates = date_range('1997-12-31', periods=n_dates, freq='B')
# encode dates as YYYYMMDD integers
dates = Index(map(lambda x: x.year * 10000 + x.month * 100 + x.day, dates))

secid_min = int('10000000', 16)
secid_max = int('F0000000', 16)
step = (secid_max - secid_min) // (n_securities - 1)
security_ids = map(lambda x: hex(x)[2:10].upper(),
                   range(secid_min, secid_max + 1, step))

data_index = MultiIndex(levels=[dates.values, security_ids],
                        labels=[[i for i in xrange(n_dates)
                                 for _ in xrange(n_securities)],
                                range(n_securities) * n_dates],
                        names=['date', 'security_id'])
n_data = len(data_index)

columns = Index(['factor{}'.format(i) for i in xrange(1, n_columns + 1)])

data = DataFrame(np.random.randn(n_data, n_columns),
                 index=data_index, columns=columns)

# sprinkle NaNs into each factor column at a regular stride
step = int(n_data * share_na)
for column_index in xrange(n_columns):
    index = column_index
    while index < n_data:
        data.set_value(data_index[index], columns[column_index], np.nan)
        index += step

f_fillna = lambda x: x.fillna(method='pad')
"""

groupby_transform = \
    Benchmark("data.groupby(level='security_id').transform(f_fillna)",
              setup)
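
#----------------------------------------------------------------------
# Ad-hoc execution (not part of the suite)
#
# A minimal sketch, for local experimentation only: these Benchmark
# objects are normally collected and executed by the vb_suite runner,
# but vbench's Benchmark exposes a run() method that execs the setup,
# times the statement, and returns a result dict. The exact result
# keys are an assumption about the vbench version in use; inspect the
# returned dict rather than relying on specific keys.

if __name__ == '__main__':
    result = groupby_multi_cython.run()  # assumed vbench Benchmark.run() API
    print(result)                        # timing info, e.g. per-call duration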