Time Series Analysis
Time Series Analysis
Analysis----------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# NOTE(review): `df` is not created anywhere above in this file -- presumably
# it was loaded earlier (e.g. in a previous notebook cell); confirm.
# Parse the 'Month' column into datetimes and make it the index of the frame.
df['Month'] = pd.to_datetime(df['Month'])
df.set_index('Month', inplace=True)
# Default figure size (width, height in inches) for the plots that follow.
plt.rcParams['figure.figsize'] = 8,4
# Plot the frame against its new 'Month' index.
df.plot()
def adf_test(values):
    """Run the Augmented Dickey-Fuller test on ``values`` and print the outcome.

    Prints the first four entries of the ``adfuller`` result next to their
    labels, then a one-line verdict based on the p-value: at ``p <= 0.05``
    the unit-root null hypothesis is rejected (series treated as stationary),
    otherwise the series is reported as non-stationary.

    Args:
        values: The series of observations, passed unchanged to ``adfuller``.

    Returns:
        None. All output goes to stdout.
    """
    # Imported locally because no import of `adfuller` is visible in this file.
    from statsmodels.tsa.stattools import adfuller

    result = adfuller(values)
    labels = [
        'ADF Test Statistic',
        'p-value',
        '#Lags Used',
        'Number of Observations Used',
    ]
    # zip() stops at the shorter input, so only the first four result entries
    # (the ones that have a label) are printed.
    for value, label in zip(result, labels):
        print(label + ' : ' + str(value))

    # result[1] is the entry labelled 'p-value' above.
    if result[1] <= 0.05:
        print(
            "strong evidence against the null hypothesis(Ho), reject the null "
            "hypothesis. Data has no unit root and is stationary"
        )
    else:
        print(
            "weak evidence against null hypothesis, time series has a unit root, "
            "indicating it is non-stationary "
        )
# Run the stationarity check on the raw 'Sales' column.
adf_test(df['Sales'])
------------Difference
# First difference: each value minus the value one row earlier.
df['Sales First Difference'] = df['Sales']-df['Sales'].shift(1)
# Since the data is seasonal (the sales cycle usually spans a year, i.e. one seasonal
# period), also take a seasonal difference.
# Seasonal difference: each value minus the value 12 rows earlier.
df['Seasonal First Difference'] = df['Sales']-df['Sales'].shift(12)
# Or: Series.diff() (default periods=1) computes the same thing as
# df['Sales'] - df['Sales'].shift(1). The result is not stored here.
df['Sales'].diff()
# Default figure size (width, height in inches) for the plot below.
plt.rcParams['figure.figsize'] = 8,4
# Rolling mean and standard deviation of 'Sales' over a 12-row window.
rolmean = df['Sales'].rolling(12).mean()
rolstd = df['Sales'].rolling(12).std()
# Draw the original series together with its rolling mean and rolling std.
orig = plt.plot(df['Sales'], color='blue',label='Original')
mean = plt.plot(rolmean, color='red', label='Rolling Mean')
std = plt.plot(rolstd, color='black', label = 'Rolling Std')
plt.legend()
----------------Decomposing
from statsmodels.tsa.seasonal import seasonal_decompose
The right order of differencing is the minimum differencing required to get a near-
stationary series that roams around a defined mean and whose ACF plot reaches
zero fairly quickly.
If the autocorrelations are positive for a large number of lags (10 or more), then the
series needs further differencing. On the other hand, if the lag-1 autocorrelation
itself is too negative, then the series is probably over-differenced.
If you can’t really decide between two orders of differencing, then go
with the order that gives the smallest standard deviation in the differenced series.
# Display whatever figures are currently open.
plt.show()
# Switch the default figure size to 9 x 3 inches at 120 dpi.
plt.rcParams.update({'figure.figsize':(9,3), 'figure.dpi':120})
plt.show()
# Fit an ARIMA(p=1, d=1, q=1) model to 'Sales', forecast part of the existing
# range, then forecast beyond the end of the data.
# ARIMA is imported here because no import of it is visible in this file.
from statsmodels.tsa.arima.model import ARIMA

model = ARIMA(df['Sales'], order=(1, 1, 1))
model_fit = model.fit()
model_fit.summary()

# Dynamic prediction for observations 90..103, stored alongside the actual
# values so both can be plotted together.
df['forecast'] = model_fit.predict(start=90, end=103, dynamic=True)
df[['Sales', 'forecast']].plot(figsize=(12, 8))

# Append empty rows (same columns as `df`) indexed by the future dates.
# NOTE(review): `future_dates` is not defined anywhere in the visible part of
# this file; it must be built before this line -- confirm where it comes from.
future_datest_df = pd.DataFrame(index=future_dates[1:], columns=df.columns)
future_df = pd.concat([df, future_datest_df])

# Predict with `model_fit`, the fitted model above. This line previously called
# `results.predict`, but no `results` object is defined in this file.
future_df['forecast'] = model_fit.predict(start=104, end=120, dynamic=True)
future_df[['Sales', 'forecast']].plot(figsize=(12, 8))
import warnings
import itertools
import numpy as np
import matplotlib.pyplot as plt
# Suppress all warnings from here on; note this runs before the pandas and
# statsmodels imports below.
warnings.filterwarnings("ignore")
# Apply matplotlib's 'fivethirtyeight' style sheet to subsequent figures.
plt.style.use('fivethirtyeight')
import pandas as pd
import statsmodels.api as sm
import matplotlib
# Global plot defaults: axis-label and tick-label font sizes, and black ('k') text.
matplotlib.rcParams['axes.labelsize'] = 14
matplotlib.rcParams['xtick.labelsize'] = 12
matplotlib.rcParams['ytick.labelsize'] = 12
matplotlib.rcParams['text.color'] = 'k'
# Load only the needed columns from the 'Orders' sheet. Columns can be
# restricted at read time with `usecols` (as here), or everything can be read
# and the unused columns dropped afterwards.
# The path must stay one string literal: a quoted string cannot span a line break.
df = pd.read_excel(
    'C:\\Users\\wooju\\Desktop\\Python Programing\\Python Learning Journey\\Dataset\\Superstore.xls',
    sheet_name='Orders',
    usecols=['Order Date', 'Segment'],
)
If you were interested in summarizing all of the sales by month, you could use the
resample function. The tricky part about using resample is that it only operates on
an index. In this data set, the data is not indexed by the date column, so resample
would not work without restructuring the data. To make it work, use
set_index to make the date column the index and then resample.