0% found this document useful (0 votes)
25 views5 pages

02 B Regression Healthcare

Uploaded by

rayachotiusa
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
25 views5 pages

02 B Regression Healthcare

Uploaded by

rayachotiusa
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 5

Healthcare Data

-----------------------------------------------------------------------------------
--------------------------------------------------
# Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore harmless warnings
# NOTE(review): filterwarnings("ignore") silences every warning category for
# the rest of the session, not only the harmless ones — confirm that is intended.

import warnings
warnings.filterwarnings("ignore")

# Set to display all the columns in dataset
# (None removes the cap on how many columns are shown when a DataFrame is printed)

pd.set_option("display.max_columns", None)

# Import psql to run queries

import pandasql as psql


-----------------------------------------------------------------------------------
--------------------------------------------------
# Load the Health Insurance dataset.
# The path is a raw string so the Windows backslashes are taken literally.
# It has to sit on a single line: a single-quoted string literal cannot span
# lines, so a wrapped path is a syntax error.

HealthIns = pd.read_csv(
    r"E:\R3SPAnalytics\00 IIT KGP Hyd\00-LATA\Datasets\Health_Ins_Expenses.csv",
    header=0,
)

# Keep an untouched copy of the raw data

HealthIns_BK = HealthIns.copy()

# Display the first 5 records (visible only in an interactive session)

HealthIns.head()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display the dataset information (columns, non-null counts and dtypes)

HealthIns.info()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Collect the rows flagged as duplicates within the dataset; with keep='last'
# the final occurrence of each repeated row is the one left unflagged

HealthIns_dup = HealthIns.loc[HealthIns.duplicated(keep='last')]

# Show the flagged rows

HealthIns_dup
-----------------------------------------------------------------------------------
--------------------------------------------------
# Drop the repeated rows, keeping the first occurrence of each

HealthIns = HealthIns.drop_duplicates(keep='first')

# Show the (rows, columns) shape after the removal

HealthIns.shape
-----------------------------------------------------------------------------------
--------------------------------------------------
# Re-setting the row index
# (drop=True discards the old index instead of keeping it as a column, so the
# rows are numbered 0..n-1 again after the duplicate removal)

HealthIns = HealthIns.reset_index(drop=True)

# Copy file to back-up file after deletion of duplicate records
# (this copy is what the predictions are merged back onto, by index, further down)

HealthIns_BK2 = HealthIns.copy()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display the dataset information after deletion of duplicates

HealthIns.info()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display the number of distinct values in each variable

HealthIns.nunique()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display the count of missing values for each variable

HealthIns.isnull().sum()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display how often each category of the 'Gender' variable occurs

HealthIns['Gender'].value_counts()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Encode 'Gender' as an integer: female -> 0, male -> 1.
# 'female' is substituted first because 'male' is a substring of it.

HealthIns['Gender'] = (
    HealthIns['Gender']
    .str.replace('female', '0')
    .str.replace('male', '1')
    .astype(int)
)
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display how often each category of the 'Smoker' variable occurs

HealthIns['Smoker'].value_counts()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Encode 'Smoker' as an integer: no -> 0, yes -> 1

HealthIns['Smoker'] = (
    HealthIns['Smoker']
    .str.replace('no', '0')
    .str.replace('yes', '1')
    .astype(int)
)
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display how often each category of the 'Region' variable occurs

HealthIns['Region'].value_counts()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Encode 'Region' as an integer:
# northeast -> 0, northwest -> 1, southeast -> 2, southwest -> 3

HealthIns['Region'] = (
    HealthIns['Region']
    .str.replace('northeast', '0')
    .str.replace('northwest', '1')
    .str.replace('southeast', '2')
    .str.replace('southwest', '3')
    .astype(int)
)
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display the dataset information after transformation of data
# (the encoded 'Gender', 'Smoker' and 'Region' columns should now show an integer dtype)

HealthIns.info()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Remove the record identifier, which is treated as having no influence on
# the target variable

HealthIns.drop(columns=['Record_ID'], inplace=True)

# Show the first 5 records

HealthIns.head()
-----------------------------------------------------------------------------------
--------------------------------------------------
# Display all the column variables remaining after the deletion above

HealthIns.columns
-----------------------------------------------------------------------------------
--------------------------------------------------
# Identify the variables for scaling (MinMaxScaler)
# 'Gender' and 'Smoker' are not listed; they were encoded as 0/1 earlier.

cols = ['Age', 'BMI', 'Children','Region']


-----------------------------------------------------------------------------------
--------------------------------------------------
# Identify the independent and Target (dependent) variables

TargetVar = 'Expenses'

# Every column other than the target is used as a predictor
IndepVar = [col for col in HealthIns.columns if col != TargetVar]

x = HealthIns[IndepVar]
y = HealthIns[TargetVar]
-----------------------------------------------------------------------------------
--------------------------------------------------
# Split the data into train and test (random sampling)

from sklearn.model_selection import train_test_split

# 70/30 random split; the fixed random_state makes the partition repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Show the shapes of the train and test partitions

x_train.shape, x_test.shape, y_train.shape, y_test.shape


-----------------------------------------------------------------------------------
--------------------------------------------------
# Scaling the features by using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler(feature_range=(0, 1))

# Work on explicit copies so the column assignments below never write into a
# slice of the original feature frame.
x_train = x_train.copy()
x_test = x_test.copy()

# Learn the per-column min/max from the training partition only ...
x_train[cols] = mmscaler.fit_transform(x_train[cols])

# ... and apply those same parameters to the test partition. Re-fitting on the
# test data would scale it with different min/max values than the model was
# trained on, and would leak test-set information into preprocessing.
x_test[cols] = mmscaler.transform(x_test[cols])
-----------------------------------------------------------------------------------
--------------------------------------------------
Multiple Regression Algorithm
-----------------------------------------------------------------------------------
--------------------------------------------------
# Train the algorithm and build the model with train dataset

from sklearn.linear_model import LinearRegression

# Instantiate the regression estimator and train it on the training
# partition (fit returns the estimator itself, so the calls can be chained)

ModelRGR = LinearRegression().fit(x_train, y_train)

# Generate predictions for the test partition

y_pred = ModelRGR.predict(x_test)

# Evaluation metrics for Regression model

from sklearn import metrics

print('Mean Absolute Error (MAE):',
      round(metrics.mean_absolute_error(y_test, y_pred), 3))
print('Mean Squared Error (MSE):',
      round(metrics.mean_squared_error(y_test, y_pred), 3))
print('Root Mean Squared Error (RMSE):',
      round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred), 6))
print('Mean Absolute Percentage Error (MAPE):',
      round(metrics.mean_absolute_percentage_error(y_test, y_pred) * 100, 3), '%')

# RMSLE is the square root of the mean squared *log* error, not the log of
# the RMSE. The log error is undefined for negative values and a linear model
# can predict negative expenses, so predictions are floored at 0 first.
print('Root Mean Squared Log Error (RMSLE):',
      round(np.sqrt(metrics.mean_squared_log_error(y_test, np.clip(y_pred, 0, None))), 3))

# Define the function to calculate the MAPE - Mean Absolute Percentage Error

def MAPE(y_test, y_pred):
    """Return the Mean Absolute Percentage Error, in percent.

    Parameters
    ----------
    y_test : array-like
        Actual target values. A zero here makes the corresponding term a
        division by zero, so the result becomes inf or nan.
    y_pred : array-like
        Predicted target values, same length as ``y_test``.

    Returns
    -------
    float
        mean(|y_test - y_pred| / |y_test|) * 100
    """
    # Convert to arrays so lists and pandas Series are handled alike and the
    # arithmetic is positional rather than index-aligned.
    y_test, y_pred = np.array(y_test), np.array(y_pred)
    return np.mean(np.abs((y_test - y_pred) / y_test)) * 100

# Evaluation of MAPE
# (hand-written version; it can be cross-checked against the
# metrics.mean_absolute_percentage_error figure printed above)

result = MAPE(y_test, y_pred)


print('Mean Absolute Percentage Error (MAPE):', round(result, 3), '%')

# Calculate Adjusted R squared values
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1)
# n (observations) and p (predictors) must describe the sample the R^2 was
# computed on. R^2 here is measured on the test partition, so n and p are
# taken from y_test / x_test rather than from the full dataset.

r_squared = round(metrics.r2_score(y_test, y_pred),6)

adjusted_r_squared = round(
    1 - (1 - r_squared) * (len(y_test) - 1) / (len(y_test) - x_test.shape[1] - 1),
    6,
)
print('Adj R Square: ', adjusted_r_squared)
-----------------------------------------------------------------------------------
--------------------------------------------------
# Put actual and predicted expenses side by side

Results = pd.DataFrame({'Expenses_A': y_test, 'Expenses_P': y_pred})

# Attach them to the de-duplicated backup by matching row index on both sides

ResultsFinal = pd.merge(
    HealthIns_BK2,
    Results,
    left_index=True,
    right_index=True,
)

# Show 5 randomly chosen records

ResultsFinal.sample(5)
-----------------------------------------------------------------------------------
--------------------------------------------------
# Signed percentage error of each prediction relative to the actual value,
# rounded to 3 decimals

ResultsFinal['%Error'] = (
    (ResultsFinal['Expenses_A'] - ResultsFinal['Expenses_P'])
    .div(ResultsFinal['Expenses_A'])
    .mul(100)
    .round(3)
)

# Show 5 randomly chosen records

ResultsFinal.sample(5)
-----------------------------------------------------------------------------------
--------------------------------------------------

You might also like