mock_coding
December 2, 2020
[340]: import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
[341]: dataset = pd.read_csv('insurance-data.csv')
dataset.head()
[341]: age gender bmi childrens smoker region cost
0 19 female 27.900 0 yes southwest 16884.92400
1 18 male NaN 1 no southeast 1725.55230
2 28 male 33.000 3 no southeast 4449.46200
3 33 male 22.705 0 no northwest 21984.47061
4 32 male 28.880 0 no northwest 3866.85520
[342]: dataset.dropna(inplace=True)
[343]: dataset = pd.get_dummies(dataset, columns = [ 'gender', 'smoker', 'region'] )
[344]: dataset.head()  # preview the frame after one-hot encoding
[344]: age bmi childrens cost gender_female gender_male smoker_no \
0 19 27.900 0 16884.92400 1 0 0
2 28 33.000 3 4449.46200 0 1 1
3 33 22.705 0 21984.47061 0 1 1
4 32 28.880 0 3866.85520 0 1 1
5 31 25.740 0 3756.62160 1 0 1
smoker_yes region_northeast region_northwest region_southeast \
0 1 0 0 0
2 0 0 0 1
3 0 0 1 0
4 0 0 1 0
5 0 0 0 1
region_southwest
0 1
2 0
3 0
4 0
5 0
[345]: dataset.dtypes  # check that every column is numeric before modelling
[345]: age int64
bmi float64
childrens int64
cost float64
gender_female uint8
gender_male uint8
smoker_no uint8
smoker_yes uint8
region_northeast uint8
region_northwest uint8
region_southeast uint8
region_southwest uint8
dtype: object
[346]: X = dataset.drop('cost', axis=1).values
y = dataset.loc[:,'cost'].values
print(X[:2])
print(y[:2])
[[19. 27.9 0. 1. 0. 0. 1. 0. 0. 0. 1. ]
[28. 33. 3. 0. 1. 1. 0. 0. 0. 1. 0. ]]
[16884.924 4449.462]
[347]: from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
[348]: from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)
[348]: LinearRegression()
[349]: y_pred = regressor.predict(X_test)
print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False):.2f}")
print(f"R^2: {regressor.score(X_test, y_test):.2f}")
RMSE: 6239.55
R^2: 0.74