m03 v01 Store Sales Prediction
m03 v01 Store Sales Prediction
1 0.0. IMPORTS
[1]: import math
import numpy as np
import pandas as pd
import inflection
# Plot defaults for the notebook (presumably the body of jupyter_settings(), whose
# `def` line is not visible in this extract — TODO confirm).
# Use matplotlib's 'bmh' style sheet.
plt.style.use( 'bmh' )
# Default figure size and base font size applied to every subsequent plot.
plt.rcParams['figure.figsize'] = [25, 12]
plt.rcParams['font.size'] = 24
# NOTE(review): sns.set() applies seaborn's own theme to rcParams and may override
# the font size chosen above — confirm whether it should run before those lines.
sns.set()
[3]: jupyter_settings()
1
1.2 0.2. Loading data
[4]: df_sales_raw = pd.read_csv( '../data/train.csv', low_memory=False )
df_store_raw = pd.read_csv( '../data/store.csv', low_memory=False )
# low_memory=False: parse each file in one pass instead of in chunks, so every
# column gets a single consistently inferred dtype.
# merge: left join on 'Store' keeps every row of the sales file and attaches the
# matching columns from the store file.
df_raw = pd.merge( df_sales_raw, df_store_raw, how='left', on='Store' )
# rename: replace df1's column labels with cols_new (built outside this extract;
# presumably the snake_case versions of the raw column names — TODO confirm)
df1.columns = cols_new
2
promo int64
state_holiday object
school_holiday int64
store_type object
assortment object
competition_distance float64
competition_open_since_month float64
competition_open_since_year float64
promo2 int64
promo2_since_week float64
promo2_since_year float64
promo_interval object
dtype: object
[9]: store 0
day_of_week 0
date 0
sales 0
customers 0
open 0
promo 0
state_holiday 0
school_holiday 0
store_type 0
assortment 0
competition_distance 2642
competition_open_since_month 323348
competition_open_since_year 323348
promo2 0
promo2_since_week 508031
promo2_since_year 508031
promo_interval 508031
dtype: int64
3
0 0 a a 2020.0
2.0 2011.0 1 14.0 2011.0
Mar,Jun,Sept,Dec
[11]: #competition_distance
# Missing-value treatment, vectorised (the previous row-wise df.apply did the same
# per-row fallback but is far slower). `date` must already be a datetime64 column,
# which the row-wise version also required (it read x['date'].month and friends).

# competition_distance: a missing value is replaced by 200000.0, presumably a
# distance larger than any observed one, i.e. "no competitor nearby".
df1['competition_distance'] = df1['competition_distance'].fillna( 200000.0 )

#competition_open_since_month: when unknown, fall back to the month of the row's date
df1['competition_open_since_month'] = df1['competition_open_since_month'].fillna( df1['date'].dt.month )

#competition_open_since_year: when unknown, fall back to the year of the row's date
df1['competition_open_since_year'] = df1['competition_open_since_year'].fillna( df1['date'].dt.year )

#promo2_since_week: when unknown, fall back to the ISO week of the row's date
# (isocalendar().week is a nullable UInt32 column; cast to float64 to match the target)
df1['promo2_since_week'] = df1['promo2_since_week'].fillna( df1['date'].dt.isocalendar().week.astype( 'float64' ) )

#promo2_since_year: when unknown, fall back to the year of the row's date
df1['promo2_since_year'] = df1['promo2_since_year'].fillna( df1['date'].dt.year )

#promo_interval
# Month abbreviations must match the tokens used in promo_interval exactly
# (e.g. 'Mar,Jun,Sept,Dec'): the data uses English 'Feb' and the four-letter 'Sept',
# so the former 'Fev' / 'Sep' entries could never match those two months.
month_map = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun',
             7: 'Jul', 8: 'Aug', 9: 'Sept', 10: 'Oct', 11: 'Nov', 12: 'Dec'}
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form does not update df1 under Copy-on-Write.
df1['promo_interval'] = df1['promo_interval'].fillna( 0 )
[12]: df1.isna().sum()
[12]: store 0
day_of_week 0
date 0
sales 0
customers 0
open 0
promo 0
4
state_holiday 0
school_holiday 0
store_type 0
assortment 0
competition_distance 0
competition_open_since_month 0
competition_open_since_year 0
promo2 0
promo2_since_week 0
promo2_since_year 0
promo_interval 0
month_map 0
is_promo 0
dtype: int64
# Cast the imputed year/week columns to integers: the competition opening year
# first, then the two promo2 columns.
for int_col in ( 'competition_open_since_year', 'promo2_since_week', 'promo2_since_year' ):
    df1[int_col] = df1[int_col].astype( int )
5
# concatenate: stack the statistic frames row-wise, transpose the result and move
# the index out into a regular column
# (d1..d6 and ct1, ct2 are built outside this extract — presumably one frame per statistic)
m = pd.concat( [d2, d3, d4, ct1, ct2, d1, d5, d6] ).transpose().reset_index()
# label the columns in the same order the frames were stacked
m.columns = [
    'attributes',
    'min', 'max', 'range',
    'mean', 'median',
    'std', 'skew', 'kurtosis',
]
6
2.7.2 1.7.2. Categorical Attributes
[17]: state_holiday 4
store_type 4
assortment 3
promo_interval 4
month_map 12
dtype: int64
# One row of three panels: the distribution of sales for each level of each
# categorical attribute, drawn left to right in the order listed here.
for panel_idx, cat_col in enumerate( ['state_holiday', 'store_type', 'assortment'], start=1 ):
    plt.subplot( 1, 3, panel_idx )
    sns.boxplot( x=cat_col, y='sales', data=aux )
7
3 2.0. PASSO 02 - FEATURE ENGINEERING
[19]: df2 = df1.copy()
8
3.2 2.2. Criação das Hipóteses
3.2.1 2.2.1. Hipóteses Loja
1. Lojas com número maior de funcionários deveriam vender mais.
2. Lojas com maior capacidade de estoque deveriam vender mais.
3. Lojas com maior porte deveriam vender mais.
4. Lojas com maior sortimento deveriam vender mais.
5. Lojas com competidores mais próximos deveriam vender menos.
6. Lojas com competidores há mais tempo deveriam vender mais.
9
5. Lojas com mais dias de promoção deveriam vender mais.
7. Lojas com mais promoções consecutivas deveriam vender mais.
8. Lojas abertas durante o feriado de Natal deveriam vender mais.
9. Lojas deveriam vender mais ao longo dos anos.
10. Lojas deveriam vender mais no segundo semestre do ano.
11. Lojas deveriam vender mais depois do dia 10 de cada mês.
12. Lojas deveriam vender menos aos finais de semana.
13. Lojas deveriam vender menos durante os feriados escolares.
# month
df2['month'] = df2['date'].dt.month
# day
df2['day'] = df2['date'].dt.day
# week of year (ISO week). Series.dt.weekofyear was removed in pandas 2.0;
# dt.isocalendar().week is its replacement. It returns a nullable UInt32 column,
# so cast back to int64 to keep the dtype weekofyear used to produce.
df2['week_of_year'] = df2['date'].dt.isocalendar().week.astype( 'int64' )
# year week. NOTE: '%W' numbers weeks from the first Monday of the year (00-53),
# so it does not always equal the ISO week stored in week_of_year.
df2['year_week'] = df2['date'].dt.strftime( '%Y-%W' )
# competition since: first day of the month in which the competitor opened,
# assembled column-wise from the year/month columns (replaces a row-wise
# datetime.datetime(...) apply; same values, datetime64 result)
df2['competition_since'] = pd.to_datetime( pd.DataFrame( {
    'year': df2['competition_open_since_year'],
    'month': df2['competition_open_since_month'],
    'day': 1,
} ) )
# promo since: 'year-week' string built from the promo2 start year and week
df2['promo_since'] = df2['promo2_since_year'].astype( str ) + '-' + df2['promo2_since_week'].astype( str )
# assortment
10
# Replace the single-letter assortment codes with readable labels; any code
# other than 'a' or 'b' becomes 'extended'.
assortment_labels = {'a': 'basic', 'b': 'extra'}
df2['assortment'] = df2['assortment'].apply( lambda code: assortment_labels.get( code, 'extended' ) )
# state holiday: 'a' / 'b' / 'c' name the holiday type; every other value is a
# regular day.
holiday_labels = {'a': 'public_holiday', 'b': 'easter_holiday', 'c': 'christmas'}
df2['state_holiday'] = df2['state_holiday'].apply( lambda code: holiday_labels.get( code, 'regular_day' ) )
11