0% found this document useful (0 votes)
5 views

Predict Inflation Using Random Forest Regression

This document cleans and preprocesses inflation data from a CSV file. It handles missing values, encodes categorical data, splits the data into training and test sets, and uses random forest and linear regression models to predict inflation values.

Uploaded by

ahmed salem
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
5 views

Predict Inflation Using Random Forest Regression

This document cleans and preprocesses inflation data from a CSV file. It handles missing values, encodes categorical data, splits the data into training and test sets, and uses random forest and linear regression models to predict inflation values.

Uploaded by

ahmed salem
Copyright
© All Rights Reserved
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 3

import pandas as pd

import numpy as np

# Load the raw inflation table (comma-separated, Latin-1 encoded).
dset = pd.read_csv("/content/Data_Inflation.csv", encoding='latin-1', sep=",")

# Preview the first ten rows.
dset.head(10)

# Count how many columns there are of each dtype.
dset.dtypes.value_counts()

# Boolean mask flagging the object-typed columns.
dset.dtypes == 'object'

# Discard the 'Unit of Measurement' column.
dset = dset.drop('Unit of Measurement', axis=1)

print(list(dset.columns))

# Give the two headers whose names start with a space clean identifiers.
dset = dset.rename(
    columns={
        ' Country Code': 'CountryCode',
        ' Subregion': 'Subregion',
    }
)

# Box plot of the Inflation column.
import seaborn as sns
sns.boxplot(dset['Inflation'])

# Frequency of each Subregion value.
dset['Subregion'].value_counts()

# Summary statistics of Inflation.
dset['Inflation'].describe()

# Total number of missing cells in the whole dataset.
dset.isna().sum().sum()

# Missing-value count for each column of interest, computed in one pass.
missing_per_column = dset[
    ['CountryCode', 'Subregion', 'RegionalMember', 'Inflation']
].isna().sum()
cc = missing_per_column['CountryCode']
sr = missing_per_column['Subregion']
rm = missing_per_column['RegionalMember']
ifn = missing_per_column['Inflation']
for label, count in (
    ("Countrycode", cc),
    ("Subregion", sr),
    ("Regional Membr", rm),
    ("Inflation", ifn),
):
    print(label, count)

dset.info()

# Drop every row that is incomplete in a column OTHER than Inflation.
# Rows whose Inflation is missing are kept on purpose: they are the rows the
# models are later asked to predict.
#
# This replaces the earlier round trip (fill Inflation with the string
# "not available", dropna, then replace the string with NaN again). That
# approach mixed strings into the Inflation column and depended on
# `replace` re-inferring a numeric dtype afterwards; restricting `dropna`
# to the other columns keeps the same rows without touching Inflation.
non_target_columns = [col for col in dset.columns if col != "Inflation"]

# Number of missing values per column, before dropping.
dset.isnull().sum()

dset = dset.dropna(axis=0, subset=non_target_columns)

# Number of missing values per column, after dropping: only Inflation
# may still contain NaN.
dset.isnull().sum()

dset.info()

dset.head(10)

# Sort by Inflation (descending) with the missing-Inflation rows first.
dset = dset.sort_values(by="Inflation", ascending=False, na_position='first')

# One-hot encode the dataset (pd.get_dummies with its default column
# selection).
dset_one_hot_encoding = pd.get_dummies(dset)

print("Dataset dimension",dset_one_hot_encoding.shape)

print(dset_one_hot_encoding)

dset.isnull().sum()

# Split the encoded data into two groups: rows where the target is missing
# and rows where it is present.
# The original assignments were broken across lines after the `=`, which is
# a SyntaxError; each assignment is now a single statement and both groups
# are derived from one shared boolean mask.
column_with_nan = "Inflation"
target_is_missing = dset_one_hot_encoding[column_with_nan].isnull()
data_with_nan = dset_one_hot_encoding[target_is_missing]
data_without_nan = dset_one_hot_encoding[~target_is_missing]

print("Dset without nan dimension",data_without_nan.shape)

print("Dset with nan dimension",data_with_nan.shape)

# Training features and target come from the rows whose Inflation is known.
X_train = data_without_nan.drop(columns=['Inflation'])
y_train = data_without_nan['Inflation']

# Rows at positions 1 through 5 form a small five-row sample.
sample_rows = slice(1, 6)
xx_train = X_train.iloc[sample_rows]
yy_train = y_train.iloc[sample_rows]

xx_train

# Switch from pandas objects to their underlying NumPy arrays.
X_train, y_train = X_train.values, y_train.values
xx_train, yy_train = xx_train.values, yy_train.values

X_train

from sklearn.ensemble import RandomForestRegressor

# Fit a random forest with default hyper-parameters on the training arrays.
rf = RandomForestRegressor()
rf = rf.fit(X_train, y_train)

# Feature rows whose Inflation is missing (target column removed).
x_test = data_with_nan.drop(columns=['Inflation'])

# Report the shapes of the training arrays and of the test frame.
Train_x, Train_y, test_x = X_train.shape, y_train.shape, x_test.shape
print("X_train", Train_x)
print("y_train", Train_y)
print("X_test", test_x)

# Predict on the five-row training sample (not on x_test).
predicted_values = rf.predict(xx_train)

print("Predicted Values from RandomForest", predicted_values)

# Predictions followed by the known targets for the same sample rows.
print(predicted_values)
print(yy_train)

type(yy_train)

# Compare the random-forest predictions with the known targets of the
# sample rows: line plot first, then error metrics.

import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error

# NOTE: the names are kept as in the original script, but mind their
# meaning: `y` holds the model output and `yhat` holds the known targets.
y = np.array(predicted_values)
yhat = np.array(yy_train)
x = list(range(len(y)))

# The legend labels used to be swapped (predictions were drawn as
# "original"); each series now carries the label that matches its content.
plt.plot(x, yhat, color="blue", label="original")
plt.plot(x, y, color="red", label="predicted")
plt.legend()
plt.show()

# scikit-learn metrics take (y_true, y_pred): known targets first.
mape = mean_absolute_percentage_error(yhat, y)
# Take the square root explicitly instead of passing `squared=False`, which
# newer scikit-learn releases no longer accept.
rmse = np.sqrt(mean_squared_error(yhat, y))
# The value is a ROOT mean squared error, so label it as such.
print("Root Mean Square Error",rmse)
print("Mean absolute percentage error",mape)
print("Mean Absolute Error",mean_absolute_error(yy_train,predicted_values))

from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the same training arrays.
lr = LinearRegression()

lr.fit(X_train,y_train)

# Predict Inflation for the rows where it is missing.
# The model was fitted on a plain NumPy array, so the test frame is passed
# as an array too; handing over the DataFrame makes scikit-learn warn about
# feature names it never saw during fit.
# When no row has a missing target there is nothing to predict, and
# `predict` would raise on the empty input, so return an empty array instead.
if len(x_test):
    y_pred_lr = lr.predict(x_test.values)
else:
    y_pred_lr = np.array([])

print("Predicted Values from LinearRegression",y_pred_lr)

# NOTE(review): yy_train holds the targets of the five-row training sample,
# not ground truth for the rows predicted above (their Inflation is missing).
print("Original",yy_train)

You might also like