02 B Regression Healthcare
02 B Regression Healthcare
-----------------------------------------------------------------------------------
--------------------------------------------------
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the whole session, including
# library deprecation notices — confirm that is intended.
warnings.filterwarnings("ignore")
# Show all columns when a DataFrame is displayed.
pd.set_option("display.max_columns", None)
# NOTE(review): `HealthIns` is not defined anywhere above this line in the
# visible source; the data-loading step appears to be missing — confirm.
# Keep a copy of the data as loaded, before any cleaning is applied.
HealthIns_BK = HealthIns.copy()
# Preview the first rows of the dataset.
HealthIns.head()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display the dataset information (column dtypes and non-null counts)
HealthIns.info()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display the duplicate rows within the dataset.
# keep='last' flags every occurrence of a repeated row except the final one.
# NOTE(review): the later drop_duplicates() call uses the default keep='first',
# so the index labels listed here are not the same as those of the rows that
# are actually removed — confirm which convention is wanted.
HealthIns_dup = HealthIns[HealthIns.duplicated(keep='last')]
HealthIns_dup
-----------------------------------------------------------------------------------
--------------------------------------------------
# Remove the identified duplicate records (pandas default: the first
# occurrence of each repeated row is kept)
HealthIns = HealthIns.drop_duplicates()
# Show the (rows, columns) shape after de-duplication
HealthIns.shape
-----------------------------------------------------------------------------------
--------------------------------------------------
# Reset the row index after dropping rows; drop=True discards the old index
# instead of keeping it as a new column
HealthIns = HealthIns.reset_index(drop=True)
# Second backup: a copy of the de-duplicated, re-indexed data
HealthIns_BK2 = HealthIns.copy()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display the dataset information after deletion of duplicates
HealthIns.info()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display the number of unique values of each variable
HealthIns.nunique()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display the number of missing values in each variable
HealthIns.isnull().sum()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display the frequency of each category of the 'Gender' variable
HealthIns['Gender'].value_counts()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Replace 'Gender' variable and convert to integer value.
# TODO(review): the 'Gender' replacement announced above is not present in the
# visible source — confirm where (or whether) the encoding is performed.
# Display the frequency of each category of the 'Smoker' variable
HealthIns['Smoker'].value_counts()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Replace 'Smoker' variable and convert to an integer value.
# TODO(review): the 'Smoker' replacement announced above is not present in the
# visible source — confirm where (or whether) the encoding is performed.
# Display the frequency of each category of the 'Region' variable
HealthIns['Region'].value_counts()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Replace 'Region' variable and convert to integer value.
# TODO(review): the 'Region' replacement announced above is not present in the
# visible source — confirm where (or whether) the encoding is performed.
# Display the dataset information to check the resulting column dtypes
HealthIns.info()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Drop the columns that do not influence the target variable.
# The 'Record_ID' column is removed from the frame in place.
HealthIns.drop(columns=['Record_ID'], inplace=True)
# Preview the remaining columns.
HealthIns.head()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display the names of all the columns that remain in the dataset
HealthIns.columns
-----------------------------------------------------------------------------------
--------------------------------------------------
# Identify the variables for scaling (MinMaxScaler).
# Every column except the target is treated as an independent variable.
TargetVar = 'Expenses'
IndepVar = [col for col in HealthIns.columns if col != TargetVar]
# x holds the independent (predictor) columns, y the target column.
x = HealthIns[IndepVar]
y = HealthIns[TargetVar]
-----------------------------------------------------------------------------------
--------------------------------------------------
# Split the data into train and test (random sampling)
# NOTE(review): no split is visible in this cell; `x_train`, `x_test`,
# `y_train`, `cols` and `mmscaler` must be defined elsewhere — confirm.
# Scale the selected columns of the test set.
# NOTE(review): fit_transform() re-fits the scaler on the test data. If
# `mmscaler` is a scikit-learn scaler already fitted on the training set,
# transform() is normally what is wanted here — confirm.
x_test[cols] = mmscaler.fit_transform(x_test[cols])
#x_test = mmscaler.fit_transform(x_test)
x_test = pd.DataFrame(x_test)
-----------------------------------------------------------------------------------
--------------------------------------------------
Multiple Regression Algorithm
-----------------------------------------------------------------------------------
--------------------------------------------------
# Train the algorithm and build the model with train dataset
# NOTE(review): `LinearRegression` is not imported in the visible source
# (presumably sklearn.linear_model.LinearRegression) — confirm.
ModelRGR = LinearRegression()
ModelRGR.fit(x_train, y_train)
# Predict the target for the test set
y_pred = ModelRGR.predict(x_test)
# Define the function to calculate the MAPE - Mean Absolute Percentage Error
# Evaluation of MAPE
# NOTE(review): neither a MAPE function nor the construction of
# `ResultsFinal` is present in the visible source — confirm where they live.
# Show five randomly sampled rows of the results table
ResultsFinal.sample(5)
-----------------------------------------------------------------------------------
--------------------------------------------------
# Calculate the % of error: (actual - predicted) / actual * 100,
# rounded to three decimal places.
ResultsFinal['%Error'] = (
    (ResultsFinal['Expenses_A'] - ResultsFinal['Expenses_P'])
    .div(ResultsFinal['Expenses_A'])
    .mul(100)
    .round(3)
)
# Show five randomly sampled rows, now including the '%Error' column.
ResultsFinal.sample(5)
-----------------------------------------------------------------------------------
--------------------------------------------------