The document discusses preparing time series sales data for modeling. It loads data, creates lag features, examines correlations between sales and lagged sales, performs periodogram analysis to identify periodic patterns, adds Fourier terms to capture seasonality, and performs k-means clustering on the data.


In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import math
import datetime

sns.set()
sns.set_style('whitegrid')
# plt.style.use("dark_background")

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 500)

from xgboost import XGBRegressor
from xgboost import plot_importance
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from scipy.signal import periodogram


def plot_features(booster, figsize):
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    return plot_importance(booster=booster, ax=ax)

import time
from itertools import product
%matplotlib inline
import os
import matplotlib.style as style
import matplotlib.gridspec as gridspec
import scipy.stats as stats
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-1-7807e2964a22> in <module>()
     16 from xgboost import XGBRegressor
     17 from xgboost import plot_importance
---> 18 from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
     19 from scipy.signal import periodogram
     20

ModuleNotFoundError: No module named 'statsmodels.tsa.deterministic'
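The statsmodels.tsa.deterministic module (home of CalendarFourier and DeterministicProcess) was added in statsmodels 0.12, so this error points to an older install. The later cells that use these classes ran successfully, which suggests the environment was upgraded, presumably with something like:

!pip install -U statsmodels   # then restart the kernel and re-run the imports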

In [2]:

d1 = {'aailiyah': 6.922918004572872, 'abandoned': 6.922918004572872, 'abroad': 6.9229180
l1 = d1.keys()
len(l1)

Out[2]:

40

In [3]:

d2 = {'aailiyah': 6.922918004572872, 'abandoned': 6.922918004572872, 'abroad': 6.9229180
l2 = d2.keys()
len(l2)

Out[3]:

50

In [6]:

a, b, c = 2, 4, 8
print(b / a + c)    # division binds tighter than addition: 4/2 + 8
print(b / (a + c))  # parentheses force the addition first: 4/10

10.0

0.4


In [4]:

set(l2) - set(l1)

Out[4]:

{'accurate',
 'actions',
 'add',
 'aged',
 'ages',
 'agree',
 'akin',
 'allow',
 'amaze',
 'angel'}
In [2]:

for dirname, _, filenames in os.walk(r'D:\AppliedAI\Python_Scripts\Kaggle_Data'):
    for filename in filenames:
        # print(os.path.join(filename))
        if filename.find('train') != -1:
            train = pd.read_csv(os.path.join(dirname, filename))
        elif filename.find('test') != -1:
            test = pd.read_csv(os.path.join(dirname, filename))
        else:
            submission = pd.read_csv(os.path.join(dirname, filename))

In [12]:

test['num_sold'] = 0
test.head()

Out[12]:

row_id date country store product num_sold

0 26298 2019-01-01 Finland KaggleMart Kaggle Mug 0

1 26299 2019-01-01 Finland KaggleMart Kaggle Hat 0

2 26300 2019-01-01 Finland KaggleMart Kaggle Sticker 0

3 26301 2019-01-01 Finland KaggleRama Kaggle Mug 0

4 26302 2019-01-01 Finland KaggleRama Kaggle Hat 0

In [17]:

matrix = pd.concat([train[['date', 'num_sold']], test[['date', 'num_sold']]], ignore_index=True)

Lag Features
In [19]:

df = matrix.reset_index()
df = df.set_index(['date'])
entries_perday = len(df.loc['2015-01-08'])
df = df.reset_index().set_index('date')
df.drop('index', axis=1, inplace=True)
print('Number of entries in a day across all stores : {}'.format(entries_perday))
df.head()

Number of entries in a day across all stores : 18

Out[19]:

            num_sold
date
2015-01-01       329
2015-01-01       520
2015-01-01       146
2015-01-01       572
2015-01-01       911
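Judging from the test rows shown earlier (two stores and three products for Finland), the 18 entries per day likely correspond to 3 countries × 2 stores × 3 products.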

In [21]:

# 18 rows per date, so shifting by 18 * lag aligns each row with the
# value `lag` calendar days earlier (same within-day position).
for lag in range(1, 11):
    df['num_sold_lag' + str(lag)] = df['num_sold'].shift(18 * lag)
df.head(5)

Out[21]:

            num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag3  num_sold_lag4  ...
date
2015-01-01       329            NaN            NaN            NaN            NaN  ...
2015-01-01       520            NaN            NaN            NaN            NaN  ...
2015-01-01       146            NaN            NaN            NaN            NaN  ...
2015-01-01       572            NaN            NaN            NaN            NaN  ...
2015-01-01       911            NaN            NaN            NaN            NaN  ...
In [22]:

plt.figure(figsize=(16, 8))
columns = [col for col in df.columns if 'num_sold' in col]
sns.heatmap(data=df[df.index < '2019-01-01'][columns].corr(), square=True,
            annot=True, cmap="Reds", vmax=1, vmin=.83, fmt=".3f")
plt.xticks(rotation=40)
plt.title('Correlation - sales vs lagged sales', fontsize=18);

Scatter Plot
In [24]:

fig, axs = plt.subplots(2, 5, figsize=(15, 6))
axs = axs.flatten()
for i in range(1, 11):
    feature = 'num_sold_lag' + str(i)
    sns.scatterplot(x=df[feature], y=df['num_sold'], ax=axs[i - 1], s=5)
    axs[i - 1].set_title(feature, fontsize=14)

plt.suptitle('Scatterplot - Lags vs Sales', fontsize=18)
plt.tight_layout();

In [25]:

df.drop(columns=['num_sold_lag3', 'num_sold_lag4', 'num_sold_lag5', 'num_sold_lag6',
                 'num_sold_lag8', 'num_sold_lag9', 'num_sold_lag10'], inplace=True)


In [28]:

# Creating a periodogram.

fs = pd.Timedelta("1Y") / pd.Timedelta("1D")   # samples per year, about 365.2425
frequencies, spectrum = periodogram(
    train['num_sold'],
    fs=fs,
    detrend='linear',
    window="boxcar",
    scaling='spectrum',
)
fig, ax = plt.subplots(figsize=(16, 5))
ax.step(frequencies, spectrum, color="purple")
ax.set_xscale("log")
ax.set_xticks([1, 2, 4, 6, 12, 26, 52, 104])
ax.set_xticklabels(
    [
        "Annual (1)",
        "Semiannual (2)",
        "Quarterly (4)",
        "Bimonthly (6)",
        "Monthly (12)",
        "Biweekly (26)",
        "Weekly (52)",
        "Semiweekly (104)",
    ],
    rotation=90,
)
ax.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
ax.set_ylabel("Variance")
ax.set_title("Periodogram", fontsize=18);
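Note that train['num_sold'] interleaves 18 rows per calendar day, while fs assumes one sample per day, so the frequency axis is only approximate. A hedged alternative (not in the original) aggregates to daily totals first so fs matches the true sampling rate:

daily = train.groupby('date')['num_sold'].sum()   # one value per day
daily_freqs, daily_spec = periodogram(
    daily, fs=365.25, detrend='linear',
    window='boxcar', scaling='spectrum',
)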
In [29]:

# choosing order = 4 because semiweekly, biweekly, and weekly periodicity was observed
fourier = CalendarFourier(freq="W", order=4)
data = df.reset_index().set_index([pd.DatetimeIndex(df.reset_index()['date'])])
y = data['num_sold']
dp = DeterministicProcess(
    index=y.index,
    order=1,
    seasonal=False,
    constant=False,
    additional_terms=[fourier],
    drop=True,
)

X = dp.in_sample()
X.shape

Out[29]:

(32868, 7)
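Why only 7 columns: with daily data and a weekly period, the order-4 Fourier terms duplicate the order-3 terms up to sign (cos(4·2πt/7) = cos(3·2πt/7) and sin(4·2πt/7) = −sin(3·2πt/7) for integer t), so drop=True keeps only 6 of the 8 sin/cos columns; with the linear trend that gives 7. A minimal sketch (separate from the notebook's df) of the same machinery on a plain daily index:

idx = pd.date_range('2015-01-01', periods=14, freq='D')
dp_demo = DeterministicProcess(
    index=idx,
    order=1,
    additional_terms=[CalendarFourier(freq='W', order=2)],
)
print(dp_demo.in_sample().head())             # trend + sin/cos(1..2, freq=W-SUN)
print(dp_demo.out_of_sample(steps=7).head())  # same columns for the next 7 days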

In [31]:

df = df.reset_index()
df['date'] = pd.to_datetime(df['date'])
df = df.reset_index().set_index('date')
df.head()

Out[31]:

            index  num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7
date
2015-01-01      0       329            NaN            NaN            NaN
2015-01-01      1       520            NaN            NaN            NaN
2015-01-01      2       146            NaN            NaN            NaN
2015-01-01      3       572            NaN            NaN            NaN
2015-01-01      4       911            NaN            NaN            NaN


In [32]:

df = df.drop('index', axis=1).join(X.drop('trend', axis=1), how='inner')
df.head()

Out[32]:

            num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)  ...
date
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
In [35]:

df = df.reset_index()
X = df[['date', 'num_sold']]
X['date'] = X['date'].apply(lambda x: x.toordinal())
# https://www.analyticsvidhya.com/blog/2021/05/k-means-clustering-with-mall-customer-seg
# Importing KMeans from sklearn
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    km = KMeans(n_clusters=i)
    km.fit(X)
    wcss.append(km.inertia_)
# The elbow curve
plt.figure(figsize=(12, 6))
plt.plot(range(1, 11), wcss, linewidth=2, color="red", marker="8")
plt.xlabel("K Value")
plt.xticks(np.arange(1, 11, 1))
plt.ylabel("WCSS")
plt.show()
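One caveat worth flagging: X mixes date.toordinal() values (around 7.3 × 10^5) with num_sold (hundreds to thousands), so the unscaled Euclidean distance is dominated by the date and the clusters mostly slice time. A hedged variant (not what the notebook does) scales the features first and fixes the seed for reproducibility:

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

X_scaled = StandardScaler().fit_transform(X)        # put both features on one scale
km_scaled = KMeans(n_clusters=5, n_init=10, random_state=42)
labels_scaled = km_scaled.fit_predict(X_scaled)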
In [36]:

# Taking 5 clusters
km1 = KMeans(n_clusters=5)
# Fitting the input data
km1.fit(X)
# Predicting the labels of the input data
y = km1.predict(X)
# Adding the labels to a column named label
df["label"] = y
# The new dataframe with the clustering done
df.head()

Out[36]:

        date  num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)  ...
0 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
1 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
2 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
3 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
4 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
In [38]:

# extract date features
df['year'] = pd.to_datetime(df['date']).dt.year
df['month_name'] = pd.to_datetime(df['date']).dt.month_name()
df['month'] = pd.to_datetime(df['date']).dt.month
df['day'] = pd.to_datetime(df['date']).dt.day
df['day_of_week_name'] = pd.to_datetime(df['date']).dt.day_name()
df['day_of_week'] = pd.to_datetime(df['date']).dt.day_of_week
df.head()

Out[38]:

        date  num_sold  num_sold_lag1  num_sold_lag2  num_sold_lag7  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)  ...
0 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
1 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
2 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
3 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...
4 2015-01-01       329            NaN            NaN            NaN           0.433884          -0.900969  ...

Standardization

In [41]:

df.fillna(0, inplace=True)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
cols = ['num_sold', 'num_sold_lag1', 'num_sold_lag2', 'num_sold_lag7']
scaler.fit(df[cols])
scaled = pd.DataFrame(scaler.transform(df[cols]))
scaled.columns = cols
scaled.head()

Out[41]:

num_sold num_sold_lag1 num_sold_lag2 num_sold_lag7

0 0.066655 -1.091752 -1.091752 -1.091752

1 0.066655 -1.091752 -1.091752 -1.091752

2 0.066655 -1.091752 -1.091752 -1.091752

3 0.066655 -1.091752 -1.091752 -1.091752

4 0.066655 -1.091752 -1.091752 -1.091752
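Note that fillna(0) turns the warm-up NaN lags into zeros before the scaler is fit, which is why the first rows above all show -1.091752 in the lag columns; dropping those first few days instead would leave the lag distributions untouched.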


In [45]:

df = df[['date', 'sin(1,freq=W-SUN)', 'cos(1,freq=W-SUN)', 'sin(2,freq=W-SUN)',
         'cos(2,freq=W-SUN)', 'cos(3,freq=W-SUN)', 'sin(4,freq=W-SUN)', 'label',
         'year', 'month_name', 'month', 'day', 'day_of_week',
         'day_of_week_name']].join(scaled, how='inner')

In [46]:

df = pd.get_dummies(df)
df.head()

Out[46]:

        date  sin(1,freq=W-SUN)  cos(1,freq=W-SUN)  sin(2,freq=W-SUN)  cos(2,freq=W-SUN)  cos(3,freq=W-SUN)  sin(4,freq=W-SUN)  ...
0 2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
1 2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
2 2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
3 2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...
4 2015-01-01           0.433884          -0.900969          -0.781831            0.62349          -0.222521          -0.974928  ...

In [50]:

# train: before 2018-06-30; validation: 2018-07-01 .. 2018-12-30; test: after 2018-12-31.
# Both comparisons are strict, so the boundary dates themselves fall in no split.
X_train = df[df.date < '2018-06-30'].drop(['num_sold', 'date'], axis=1)
Y_train = df[df.date < '2018-06-30']['num_sold']
X_valid = df[(df.date > '2018-06-30') & (df.date < '2018-12-31')].drop(['num_sold', 'date'], axis=1)
Y_valid = df[(df.date > '2018-06-30') & (df.date < '2018-12-31')]['num_sold']
X_test = df[df.date > '2018-12-31'].drop(['num_sold', 'date'], axis=1)
In [51]:

ts = time.time()

model = XGBRegressor(
    max_depth=8,
    n_estimators=1000,
    min_child_weight=300,
    colsample_bytree=0.8,
    subsample=0.8,
    eta=0.3,
    seed=42)

model.fit(
    X_train,
    Y_train,
    eval_metric="mape",
    eval_set=[(X_train, Y_train), (X_valid, Y_valid)],
    verbose=True,
    early_stopping_rounds=10)

time.time() - ts

[0] validation_0-mape:4.85086 validation_1-mape:8.26721

[1] validation_0-mape:3.42136 validation_1-mape:5.78417

[2] validation_0-mape:2.44605 validation_1-mape:4.01639

[3] validation_0-mape:1.77397 validation_1-mape:2.79364

[4] validation_0-mape:1.34831 validation_1-mape:2.01040

[5] validation_0-mape:1.10281 validation_1-mape:1.44146

[6] validation_0-mape:0.95003 validation_1-mape:1.15750

[7] validation_0-mape:0.86617 validation_1-mape:1.00791

[8] validation_0-mape:0.81581 validation_1-mape:0.91798

[9] validation_0-mape:0.79867 validation_1-mape:0.86188

[10] validation_0-mape:0.78690 validation_1-mape:0.80864

[11] validation_0-mape:0.77299 validation_1-mape:0.80242

[12] validation_0-mape:0.75649 validation_1-mape:0.80535

[13] validation_0-mape:0.75429 validation_1-mape:0.81420

[14] validation_0-mape:0.75406 validation_1-mape:0.81496

[15] validation_0-mape:0.75047 validation_1-mape:0.81933

[16] validation_0-mape:0.75265 validation_1-mape:0.82055

[17] validation_0-mape:0.75017 validation_1-mape:0.82509

[18] validation_0-mape:0.75107 validation_1-mape:0.82793

[19] validation_0-mape:0.75062 validation_1-mape:0.82923

[20] validation_0-mape:0.75051 validation_1-mape:0.82955

[21] validation_0-mape:0.75093 validation_1-mape:0.82811

Out[51]:

15.90307068824768
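The log above shows how early_stopping_rounds=10 played out: validation_1-mape bottoms out at round 11 (0.80242), rounds 12 through 21 fail to improve on it, and training halts at round 21 after roughly 16 seconds.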

Predictions
In [52]:

model.predict(X_test)

Out[52]:

array([0.3278924 , 0.3278924 , 0.3278924 , ..., 0.86840504, 0.86840504,

0.86840504], dtype=float32)

In [11]:

df = pd.read_excel("saaki test.xlsx")
df

Out[11]:

product_price product_title

0 $329.98 Sony PlayStation 4 (PS4) (Latest Model)- 500 G...

1 $324.84 Sony PlayStation 4 (Latest Model)- 500 GB Jet ...

2 $324.83 Sony PlayStation 4 PS4 500 GB Jet Black Console

3 $350.00 Sony - PlayStation 4 500GB The Last of Us Rema...

4 308.00\nTrendingat\n 319.99 Sony PlayStation 4 (PS4) (Latest Model)- 500 G...

... ... ...

32666 109.99, Sale74.95 Kamik Gamma2 Snow Boots (For Women)

32667 $175.00 Hunter Rain Boots - Tour Neoprene

32668 175.00, Sale119.95 Blondo Steffy Snow Boots - Waterproof (For Women)

32669 $169.00 Maine Pac Boots

32670 59.99, Sale39.95 Itasca Icebreaker Snow Boots - Waterproof, Ins...

32671 rows × 2 columns

In [12]:

df.head()

Out[12]:

product_price product_title

0 $329.98 Sony PlayStation 4 (PS4) (Latest Model)- 500 G...

1 $324.84 Sony PlayStation 4 (Latest Model)- 500 GB Jet ...

2 $324.83 Sony PlayStation 4 PS4 500 GB Jet Black Console

3 $350.00 Sony - PlayStation 4 500GB The Last of Us Rema...

4 308.00\nTrendingat\n 319.99 Sony PlayStation 4 (PS4) (Latest Model)- 500 G...
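The notebook never cleans product_price, though the values above mix plain prices ("$329.98") with strings like "109.99, Sale74.95". A minimal cleaning sketch (not in the original; the price_num column name is made up here) pulls out the first price-looking number:

df['price_num'] = (
    df['product_price'].astype(str)
      .str.extract(r'(\d+(?:\.\d+)?)')[0]   # first number, e.g. 329.98 from "$329.98"
      .astype(float)
)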

In [13]:

from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pandas
In [20]:

lst = df['product_title'].tolist()
# k = process.extract("Anti-Dark", lst)

In [23]:

k = process.extract("Anti-Dark", lst)

In [24]:

len(k)

Out[24]:

In [19]:

type(k)

Out[19]:

list

In [22]:

for i in k:
    print(fuzz.ratio('Anti-Matter', i))

24

17

15

18

20

10

24

14
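For reference, process.extract(query, choices) returns a list of (match, score) tuples (five by default), so rescoring against a different query should use the matched string rather than the tuple itself; the eight ratios printed above likely reflect an earlier state of k from out-of-order cell execution. A small sketch of the idiomatic pattern:

k = process.extract('Anti-Dark', lst, limit=5)   # [(title, score), ...]
for title, score in k:
    print(title, score, fuzz.ratio('Anti-Matter', title))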

