Stukas ML Loop Rev.r

This R script imports the data, cleans it, and runs several machine learning models to nowcast economic growth. It defines model and prediction functions for LASSO, random forest, ridge, elastic net, and SVM, loops over 26 quarterly dates, trains each model on all preceding data, and produces a one-step-ahead growth forecast for each date. The ML results are then compared against a neural network import, an AR(1) benchmark, and several ensembles.

#set working directory

setwd("D:/Master of Applied Statistics @ IPB/BPS 2018/04 - Semester III/Studi


Kasus/Olah Data")

#set library
library(xlsx)
library(skimr)
library(forecastML)
library(forecast)
library(glmnet)
library(randomForest)
library(TTR)
library(graphics)
library(tseries)
library(e1071)

#------------------------------------------------------------------------------
# IMPORT RAW DATA & CLEANING
#------------------------------------------------------------------------------

#import the data from Excel


dataq<-read.xlsx("dataQ.xlsx",header = T, sheetIndex = 1)
var.info<-read.xlsx("dataQ.xlsx",header = F, sheetIndex = 2)

#look into the data


View(dataq)
View(var.info)
str(dataq)
dim(dataq)

#calculate GDPCuP growth (quarter-over-quarter, in %)


growth<-rep(NA,nrow(dataq))
for(i in 2:nrow(dataq)){
growth[i]<-((dataq$GDPCuP[i]-dataq$GDPCuP[i-1])/dataq$GDPCuP[i-1])*100
}
growth
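
#NOTE: a minimal vectorised equivalent of the loop above (a sketch for cross-checking;
#growth_vec is only an illustrative name and is not used later in the script)
growth_vec<-c(NA,diff(dataq$GDPCuP)/head(dataq$GDPCuP,-1)*100)
all.equal(growth,growth_vec)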

#join var growth into dataq


dataq$growth<-growth
View(dataq)
colnames(dataq)

#Check data structure


skim(dataq)
#predictors with missing values: x12, x13, x14, x15, x16, x18

#Subset the data, excluding rows with missing values


dataq.new<-dataq[complete.cases(dataq),]
dim(dataq.new)
View(dataq.new)
colnames(dataq.new)

#subset containing the final y and final x variables


datafix<-data.frame(growth=dataq.new$growth,dataq.new[,4:21])
colnames(datafix)
skim(datafix)
dim(datafix) #42*19
#export datafix into .rds
saveRDS(datafix,"datafix.rds")

#------------------------------------------------------------------------------
# IMPORT THE CLEANED DATA
#------------------------------------------------------------------------------
#to re-run the analysis, just import datafix with the line below
datafix<-readRDS(file="datafix.rds")

#------------------------------------------------------------------------------
# FUNCTIONS USED BELOW
#------------------------------------------------------------------------------
# MODEL FUNCTION FOR LASSO
model_function <- function(data, my_outcome_col) {
x <- data[, -1, drop = FALSE]
y <- data[, 1, drop = FALSE]
x <- as.matrix(x, ncol = ncol(x))
y <- as.matrix(y, ncol = ncol(y))
set.seed(2345)
#nfolds = length(y) with grouped = FALSE gives leave-one-out cross-validation (alpha defaults to 1, i.e. LASSO)
model <- glmnet::cv.glmnet(x, y,nfolds = length(y),grouped=FALSE)
return(model)
}

#MODEL FUNCTION FOR RANDOM FOREST


model_function_2 <- function(data) {
outcome_names <- names(data)[1]
model_formula <- formula(paste0(outcome_names, "~ ."))
set.seed(2345)
model <- randomForest::randomForest(formula = model_formula, data = data,
ntree = 500)
return(model)
}

# MODEL FUNCTION FOR RIDGE


model_function_3 <- function(data, my_outcome_col) {
x <- data[, -1, drop = FALSE]
y <- data[, 1, drop = FALSE]
x <- as.matrix(x, ncol = ncol(x))
y <- as.matrix(y, ncol = ncol(y))
set.seed(2345)
model <- glmnet::cv.glmnet(x, y,nfolds = length(y),grouped=FALSE,alpha=0)
return(model)
}

# MODEL FUNCTION FOR ELASTIC NET


model_function_4 <- function(data, my_outcome_col) {
x <- data[, -1, drop = FALSE]
y <- data[, 1, drop = FALSE]
x <- as.matrix(x, ncol = ncol(x))
y <- as.matrix(y, ncol = ncol(y))
set.seed(2345)
model <- glmnet::cv.glmnet(x, y,nfolds = length(y),grouped=FALSE,alpha=0.5)
return(model) # This model is the first argument of the user-defined prediction functions below.
}

# MODEL FUNCTION FOR SVM


model_function_5 <- function(data, my_outcome_col) {
outcome_names <- names(data)[1]
model_formula <- formula(paste0(outcome_names, "~ ."))
set.seed(2345)
model <- svm(formula = model_formula, data = data,
epsilon=0,cost=10,gamma=0.01)
return(model)
}

# PREDICTION FUNCTION FOR LASSO


prediction_function <- function(model, data_features) {
x <- as.matrix(data_features, ncol = ncol(data_features))
data_pred <- data.frame("y_pred" = predict(model, x, s = "lambda.min"))
return(data_pred)
}

# PREDICTION FUNCTION FOR RANDOM FOREST


prediction_function_2 <- function(model, data_features) {
data_pred <- data.frame("y_pred" = predict(model, data_features))
return(data_pred)
}

# PREDICTION FUNCTION FOR RIDGE


prediction_function_3 <- function(model, data_features) {
x <- as.matrix(data_features, ncol = ncol(data_features))
#note: no s= is supplied here, so predict.cv.glmnet uses its default s = "lambda.1se"
data_pred <- data.frame("y_pred" = predict(model, x))
return(data_pred)
}

# PREDICTION FUNCTION FOR ELASTIC NET


prediction_function_4 <- function(model, data_features) {
x <- as.matrix(data_features, ncol = ncol(data_features))
data_pred <- data.frame("y_pred" = predict(model, x, s = "lambda.min"))
return(data_pred)
}

# PREDICTION FUNCTION FOR SVM


prediction_function_5 <- function(model, data_features) {
data_pred <- data.frame("y_pred" = predict(model, data_features))
return(data_pred)
}
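
#NOTE: a quick standalone check of one model/prediction pair outside forecastML
#(a sketch for illustration only: it fits on the raw datafix columns, whereas the
#actual training in the loop below uses lagged features built by create_lagged_df)
fit_check<-model_function(datafix[-nrow(datafix),],my_outcome_col=1)
prediction_function(fit_check,datafix[nrow(datafix),-1])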

#------------------------------------------------------------------------------
# Set the quarterly dates for the whole dataset
date_frequency<-"3 months"
dates<-seq(as.Date("2009-09-01"),as.Date("2019-12-01"),by=date_frequency)
length(dates)

#------------------------------------------------------------------------------
# LOOP OVER THE 26 QUARTERS TO NOWCAST
#------------------------------------------------------------------------------

#dates to nowcast: 26 quarters


dates[(nrow(datafix)-26+1):nrow(datafix)]

#settings for data preparation


outcome_col<-1 # The column index of the growth outcome.
horizons <- 1 # Forecast 1 time step (quarter) ahead.
lookback <- c(1:4,8) # Feature lags: 1 to 4 and 8 dataset rows (quarters).
ypredict<-NULL
for(i in 1:26){

#split data: expanding training window (all quarters before the one being nowcast)
training<-datafix[1:(nrow(datafix)-(27-i)),]

#lagged training data


data_train <- forecastML::create_lagged_df(training,
type = "train",
method = "direct",
outcome_col = outcome_col,
lookback = lookback,
horizon = horizons,
date=dates[1:nrow(training)],
frequency = date_frequency)

#windows: window_length = 0 trains on all available data (no separate validation windows)
windows<- forecastML::create_windows(data_train, window_length = 0)

#------------------------------------------------------------------------------
#FINAL MODEL FOR EACH METHOD
model_results <- forecastML::train_model(data_train,
windows = windows,
model_name = "LASSO",
model_function = model_function)
model_results_2 <- forecastML::train_model(data_train,
windows = windows,
model_name = "RF",
model_function = model_function_2)
model_results_3 <- forecastML::train_model(data_train,
windows = windows,
model_name = "RIDGE",
model_function = model_function_3)
model_results_4 <- forecastML::train_model(data_train,
windows = windows,
model_name = "ENET",
model_function = model_function_4)
model_results_5 <- forecastML::train_model(data_train,
windows = windows,
model_name = "SVM",
model_function = model_function_5)

#------------------------------------------------------------------------------
# Forecast with FINAL MODEL

# Forward-looking forecast data.frame.


data_forecast_list <- forecastML::create_lagged_df(training,
type = "forecast",
outcome_col = outcome_col,
horizons = horizons,
lookback=lookback,
date=dates[1:nrow(training)],
frequency = date_frequency)
# Forecasts h=1
data_forecast <- predict(model_results,model_results_2,model_results_3,
model_results_4,model_results_5,
prediction_function = list(prediction_function,
prediction_function_2,
prediction_function_3,
prediction_function_4,
prediction_function_5),
data=data_forecast_list)

ypredict<-rbind(ypredict,data.frame(model=data_forecast$model,
growth_pred=data_forecast$growth_pred))

}
ypredict

#nowcast results of each ML method: reshape from long format to one column per method


meto<-c("LASSO","RF","RIDGE","ENET","SVM")
df<-matrix(data=NA,ncol=1,nrow=26)
for(met in meto){
y<-ypredict$growth_pred[ypredict$model==met]
df<-cbind(df,data.frame(y))
}
df<-df[,-1]
colnames(df)<-meto
head(df)

#import the neural network results


NN<-readRDS(file="NN.rds")
df$NN<-NN
head(df)

#------------------------------------------------------------------------------
# MACHINE LEARNING MODEL EVALUATION
#------------------------------------------------------------------------------
metode<-c("LASSO","RF","RIDGE","ENET","SVM","NN")
actual<-datafix$growth[(nrow(datafix)-26+1):nrow(datafix)]

#Evaluate closeness of values (forecast accuracy)


eval<-NULL
for(i in metode){
acc<-accuracy(df[,i],actual)
eval<-rbind(eval,data.frame(metode=i,acc))
}
eval
MAD<-colMeans(abs(actual-df)) #mean absolute deviation of each method

#evaluate closeness of pattern using the correlation between actual and each method
df.all<-df
df.all$actual<-actual
(korelasi<-cor(df.all[,1:6],df.all[,1:7])[,7])

(eval.ml<-data.frame(eval,MAD,korelasi))

#export df.all
saveRDS(df.all,"df.result.rds")

#------------------------------------------------------------------------------
# AR(1) MODELING
#------------------------------------------------------------------------------
#series
y<-datafix[1:nrow(datafix),1]
#plot
plot.ts(y,ylab="%", xlab="time",main="PDB Current Price Growth")

#check stationarity: acf & unit root test


acf(y,lag.max = 20,main="ACF PDB CuP Gr")
adf.test(y)
#conclusion: fail to reject H0 - the series is not stationary

#differencing
ydiff1<-diff(y)
plot.ts(ydiff1,ylab="%", xlab="time",main="diff1 PDB Growth")
adf.test(ydiff1)
acf(ydiff1,lag.max = 20,main="ACF diff1 PDB CuP Gr")

#Nowcast 26 quarters with an ARIMA(1,0,0) model


y.ar<-NULL
for(i in 1:26){
#split data: same expanding training window as in the ML loop above
training<-datafix[1:(nrow(datafix)-(27-i)),1]

#model AR(1)
model<-arima(training,order=c(1,0,0),method="ML")

#forecast
duga<-forecast(model)
#take the 1-quarter-ahead forecast
pred<-duga$mean[1]
#make dataframe
y.ar<-rbind(y.ar,data.frame(ar=pred))
}
y.ar
#evaluation
ar1<-(accuracy(ts(y.ar),actual))
MAD<-colMeans(abs(actual-y.ar))
(eval.ar<-data.frame(ar1,MAD,korelasi=cor(y.ar,actual)))

#plot
plot.ts(actual,ylab="GDP Growth",ylim=c(-6,7),main="ARIMA(1,0,0) vs Actual")
lines(y.ar,col="red")
plot(actual,t(y.ar),main="AR(1)",ylab="forecast")

#------------------------------------------------------------------------------
# TIME SERIES plot: ACTUAL VS NOWCAST
#------------------------------------------------------------------------------
par()
#multiple plot
m <- matrix(c(1,2,3,4,5,6,7,7,7),nrow = 3,ncol = 3,byrow = TRUE)
m
nf<-layout(mat = m,heights = c(0.4,0.4,0.2))
layout.show(nf)
par(mar = c(5,5,5,5))
time<-c("2014","2015","2016","2017","2018","2019")
#plot RF
plot.ts(actual,ylab="GDP Growth",ylim=c(-6,7),main="RF",xaxt="n")
lines(df$RF,col="red",lty=3)
axis(1, at=c(3,7,11,15,19,23), labels=time,las=1,srt=315, cex.axis=0.8)
#plot LASSO
plot.ts(actual,ylab="GDP Growth",ylim=c(-6,7),main="LASSO",xaxt="n")
lines(df$LASSO,col="red",lty=3)
axis(1, at=c(3,7,11,15,19,23), labels=time,las=1,srt=315, cex.axis=0.8)
#plot RIDGE
plot.ts(actual,ylab="GDP Growth",ylim=c(-6,7),main="RIDGE",xaxt="n")
lines(df$RIDGE,col="red",lty=3)
axis(1, at=c(3,7,11,15,19,23), labels=time,las=1,srt=315, cex.axis=0.8)
#plot ENET
plot.ts(actual,ylab="GDP Growth",ylim=c(-6,7),main="ENET",xaxt="n")
lines(df$ENET,col="red",lty=3)
axis(1, at=c(3,7,11,15,19,23), labels=time,las=1,srt=315, cex.axis=0.8)
#plot SVM
plot.ts(actual,ylab="GDP Growth",ylim=c(-6,7),main="SVM",xaxt="n")
lines(df$SVM,col="red",lty=3)
axis(1, at=c(3,7,11,15,19,23), labels=time,las=1,srt=315, cex.axis=0.8)
#plot NN
plot.ts(actual,ylab="GDP Growth",ylim=c(-6,7),main="NN",xaxt="n")
lines(df$NN,col="red",lty=3)
axis(1, at=c(3,7,11,15,19,23), labels=time,las=1,srt=315, cex.axis=0.8)
par(mar = c(1,1,1.5,1))
plot(1, type = "n", axes=FALSE, xlab="", ylab="")
legend(x="top", inset=0, legend = c("Actual","Forecast"),
col=c("black","red"), lty=c(1,3),lwd=3,cex=0.9, horiz=TRUE)
par(mfrow = c(1, 1))

#------------------------------------------------------------------------------
# SCATTER plot: ACTUAL VS NOWCAST
#------------------------------------------------------------------------------
m <- matrix(c(1,2,3,4,5,6),nrow = 2,ncol = 3, byrow = TRUE)
layout(mat = m,heights = c(0.4,0.4))
par(mar = c(5,5,5,5))
plot(actual,df$RF,main="RF",ylab="Forecast",xlab="Actual")
plot(actual,df$LASSO,main="LASSO",ylab="Forecast",xlab="Actual")
plot(actual,df$RIDGE,main="RIDGE",ylab="Forecast",xlab="Actual")
plot(actual,df$ENET,main="ENET",ylab="Forecast",xlab="Actual")
plot(actual,df$SVM,main="SVM",ylab="Forecast",xlab="Actual")
plot(actual,df$NN,main="NN",ylab="Forecast",xlab="Actual")
par(mfrow = c(1, 1))
#------------------------------------------------------------------------------
# ENSEMBLE: EQUAL WEIGHTING
#------------------------------------------------------------------------------
#equal weighting for 3 methods: RF,ENET, NN
ensemble1<-rowMeans(df[,c("RF","ENET","NN")])
cbind(accuracy(ensemble1,actual),
MAD=mean(abs(actual-ensemble1)),
korelasi=cor(ensemble1,actual))

#equal weighting for 2 methods: RF,ENET


ensemble2<-rowMeans(df[,c("RF","ENET")])
cbind(accuracy(ensemble2,actual),
MAD=mean(abs(actual-ensemble2)),
korelasi=cor(ensemble2,actual))

#equal weighting for 2 methods: RF,NN


ensemble3<-rowMeans(df[,c("RF","NN")])
cbind(accuracy(ensemble3,actual),
MAD=mean(abs(actual-ensemble3)),
korelasi=cor(ensemble3,actual))

#equal weighting for 2 methods: ENET, NN


ensemble4<-rowMeans(df[,c("ENET","NN")])
cbind(accuracy(ensemble4,actual),
MAD=mean(abs(actual-ensemble4)),
korelasi=cor(ensemble4,actual))
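
#NOTE: a small helper could avoid repeating the evaluation block above (a sketch
#using the same accuracy/MAD/correlation metrics; eval_ensemble is an illustrative name)
eval_ensemble<-function(pred,actual){
cbind(accuracy(pred,actual),
MAD=mean(abs(actual-pred)),
korelasi=cor(pred,actual))
}
#example: eval_ensemble(ensemble1,actual)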

#------------------------------------------------------------------------------
# ENSEMBLE: LASSO REGRESSION
#------------------------------------------------------------------------------
head(df.all)
x<-as.matrix(df.all[,-7])
y<-as.matrix(df.all[,7])
set.seed(1234)
cv_glm<-cv.glmnet(x,y,standardize=TRUE,alpha=1,
type.measure = 'mse',nfolds = 5)

#optimal lambda value


(best.lam<-cv_glm$lambda.min)
#lasso model
reg_lasso<-glmnet(x,y,standardize = TRUE, alpha = 1)
#lasso coefficients at the optimal lambda
(lasso.coef<-coef(reg_lasso,s=best.lam))
#lasso fitted values
(lasso.fit<-predict(reg_lasso,newx=x,type="response",s=best.lam)[,1])
#evaluation
cbind(accuracy(lasso.fit,actual),
MAD=mean(abs(actual-lasso.fit)),
korelasi=cor(lasso.fit,actual))

#------------------------------------------------------------------------------
# PLOT ENSEMBLE RESULTS
#------------------------------------------------------------------------------
m <- matrix(c(1,2,3,4,5,5),nrow = 3,ncol = 2, byrow = TRUE)
layout(mat = m,heights = c(0.4,0.4,0.2))
par(mar = c(5,5,5,5))
time<-c("2014","2015","2016","2017","2018","2019")
plot.ts(actual,main="RF-ENET-NN",ylab="GDP Growth",ylim=c(-6,7),xaxt="n")
lines(ensemble1,col="red",lty=3)
axis(1, at=c(3,7,11,15,19,23), labels=time,las=1,srt=315, cex.axis=0.9)
plot.ts(actual,main="RF-ENET",ylab="GDP Growth",ylim=c(-6,7),xaxt="n")
lines(ensemble2,col="red",lty=3)
axis(1, at=c(3,7,11,15,19,23), labels=time,las=1,srt=315, cex.axis=0.9)
plot.ts(actual,main="RF-NN",ylab="GDP Growth",ylim=c(-6,7),xaxt="n")
lines(ensemble3,col="red",lty=3)
axis(1, at=c(3,7,11,15,19,23), labels=time,las=1,srt=315, cex.axis=0.9)
plot.ts(actual,main="Lasso Reg",ylab="GDP Growth",ylim=c(-6,7),xaxt="n",xlab="")
lines(lasso.fit,col="red",lty=3)
axis(1, at=c(3,7,11,15,19,23), labels=time,las=1,srt=315, cex.axis=0.9)
par(mar = c(1,1,1.5,1))
plot(1, type = "n", axes=FALSE, xlab="", ylab="")
legend(x="top", inset=0, legend = c("Actual","Forecast"),
col=c("black","red"), lty=c(1,3),lwd=3,cex=0.9, horiz=TRUE)
par(mfrow = c(1, 1))
