Car Price Prediction
Car Price Prediction
2 Import Libraries
[ ]: import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import pylab
#Normalize Data
from sklearn.preprocessing import StandardScaler
#Splitting Data
from sklearn.model_selection import train_test_split,cross_val_score
#PCA
from sklearn.decomposition import PCA
#Model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
1
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
#Metrics
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
#from sklearn.metrics._plot.roc_curve import auc
3 Load Dataset
[ ]: df = pd.read_csv("/content/drive/MyDrive/Datasets/CarPrice_Assignment.csv")
df.head()
highwaympg price
0 27 13495.0
1 27 16500.0
2 26 16500.0
3 30 13950.0
4 22 17450.0
[5 rows x 26 columns]
[ ]: df.shape
[ ]: (205, 26)
2
[ ]: df.columns
[ ]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 car_ID 205 non-null int64
1 symboling 205 non-null int64
2 CarName 205 non-null object
3 fueltype 205 non-null object
4 aspiration 205 non-null object
5 doornumber 205 non-null object
6 carbody 205 non-null object
7 drivewheel 205 non-null object
8 enginelocation 205 non-null object
9 wheelbase 205 non-null float64
10 carlength 205 non-null float64
11 carwidth 205 non-null float64
12 carheight 205 non-null float64
13 curbweight 205 non-null int64
14 enginetype 205 non-null object
15 cylindernumber 205 non-null object
16 enginesize 205 non-null int64
17 fuelsystem 205 non-null object
18 boreratio 205 non-null float64
19 stroke 205 non-null float64
20 compressionratio 205 non-null float64
21 horsepower 205 non-null int64
22 peakrpm 205 non-null int64
23 citympg 205 non-null int64
24 highwaympg 205 non-null int64
25 price 205 non-null float64
dtypes: float64(8), int64(8), object(10)
memory usage: 41.8+ KB
[ ]: df.describe()
3
[ ]: car_ID symboling wheelbase carlength carwidth carheight \
count 205.000000 205.000000 205.000000 205.000000 205.000000 205.000000
mean 103.000000 0.834146 98.756585 174.049268 65.907805 53.724878
std 59.322565 1.245307 6.021776 12.337289 2.145204 2.443522
min 1.000000 -2.000000 86.600000 141.100000 60.300000 47.800000
25% 52.000000 0.000000 94.500000 166.300000 64.100000 52.000000
50% 103.000000 1.000000 97.000000 173.200000 65.500000 54.100000
75% 154.000000 2.000000 102.400000 183.100000 66.900000 55.500000
max 205.000000 3.000000 120.900000 208.100000 72.300000 59.800000
[ ]: df["CarName"].value_counts()
[ ]: toyota corona 6
toyota corolla 6
peugeot 504 6
subaru dl 4
mitsubishi mirage g4 3
..
mazda glc 4 1
mazda rx2 coupe 1
maxda glc deluxe 1
maxda rx3 1
volvo 246 1
Name: CarName, Length: 147, dtype: int64
4
4 Univariate Analysis
[ ]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 car_ID 205 non-null int64
1 symboling 205 non-null int64
2 CarName 205 non-null object
3 fueltype 205 non-null object
4 aspiration 205 non-null object
5 doornumber 205 non-null object
6 carbody 205 non-null object
7 drivewheel 205 non-null object
8 enginelocation 205 non-null object
9 wheelbase 205 non-null float64
10 carlength 205 non-null float64
11 carwidth 205 non-null float64
12 carheight 205 non-null float64
13 curbweight 205 non-null int64
14 enginetype 205 non-null object
15 cylindernumber 205 non-null object
16 enginesize 205 non-null int64
17 fuelsystem 205 non-null object
18 boreratio 205 non-null float64
19 stroke 205 non-null float64
20 compressionratio 205 non-null float64
21 horsepower 205 non-null int64
22 peakrpm 205 non-null int64
23 citympg 205 non-null int64
24 highwaympg 205 non-null int64
25 price 205 non-null float64
dtypes: float64(8), int64(8), object(10)
memory usage: 41.8+ KB
Categorical Data
[ ]: plt.figure(figsize=(6,4))
sns.countplot(x='fueltype' , data=df ,palette='mako')
plt.xlabel('fueltype', fontsize=14)
plt.show()
5
[ ]: plt.figure(figsize=(6,4))
sns.countplot(x='aspiration' , data=df ,palette='mako')
plt.xlabel('aspiration', fontsize=14)
plt.show()
6
[ ]: plt.figure(figsize=(6,4))
sns.countplot(x='doornumber' , data=df ,palette='mako')
plt.xlabel('doornumber', fontsize=14)
plt.show()
7
[ ]: plt.figure(figsize=(6,4))
sns.countplot(x='carbody' , data=df ,palette='mako')
plt.xlabel('carbody', fontsize=14)
plt.show()
8
[ ]: plt.figure(figsize=(6,4))
sns.countplot(x='drivewheel' , data=df ,palette='mako')
plt.xlabel('drivewheel', fontsize=14)
plt.show()
9
[ ]: plt.figure(figsize=(6,4))
sns.countplot(x='enginelocation' , data=df ,palette='mako')
plt.xlabel('enginelocation', fontsize=14)
plt.show()
10
[ ]: plt.figure(figsize=(6,4))
sns.countplot(x='enginetype' , data=df ,palette='mako')
plt.xlabel('enginetype', fontsize=14)
plt.show()
11
[ ]: plt.figure(figsize=(6,4))
sns.countplot(x='cylindernumber' , data=df ,palette='mako')
plt.xlabel('cylindernumber', fontsize=14)
plt.show()
12
[ ]: plt.figure(figsize=(6,4))
sns.countplot(x='fuelsystem' , data=df ,palette='mako')
plt.xlabel('fuelsystem', fontsize=14)
plt.show()
13
Numerical Data
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["symboling"], kde=True, color="orange", bins=10)
<ipython-input-254-abac86350aed>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
14
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["wheelbase"], kde=True, color="orange", bins=10)
<ipython-input-255-c02d41bbd045>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
15
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["carlength"], kde=True, color="orange", bins=10)
<ipython-input-256-1ec7de8e8cab>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
16
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["carwidth"], kde=True, color="orange", bins=10)
<ipython-input-257-84f1c82220a3>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
17
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["carheight"], kde=True, color="orange", bins=10)
<ipython-input-258-3a91cc5beecd>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
18
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["curbweight"], kde=True, color="orange", bins=10)
<ipython-input-259-deab17bd35f5>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
19
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["enginesize"], kde=True, color="orange", bins=10)
<ipython-input-260-06ba97e738c6>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
20
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["boreratio"], kde=True, color="orange", bins=10)
<ipython-input-261-4bf018894454>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
21
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["stroke"], kde=True, color="orange", bins=10)
<ipython-input-262-1e3936c3f84c>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
22
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["compressionratio"], kde=True, color="orange", bins=10)
<ipython-input-263-bdf77303f0d5>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
23
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["horsepower"], kde=True, color="orange", bins=10)
<ipython-input-264-929ca17da2b3>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
24
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["peakrpm"], kde=True, color="orange", bins=10)
<ipython-input-265-5b6674878ec6>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
25
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["citympg"], kde=True, color="orange", bins=10)
<ipython-input-266-313140d11e4a>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
26
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["highwaympg"], kde=True, color="orange", bins=10)
<ipython-input-267-060fe6ea79fe>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
27
[ ]: sns.set(rc={"figure.figsize":(6,4)})
sns.distplot(df["price"], kde=True, color="orange", bins=10)
<ipython-input-268-24d7eb5af33d>:2: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
28
5 EDA (Exploratory Data Analysis)
Remove Duplicates
[ ]: df.duplicated().sum()
[ ]: 0
[ ]: car_ID 0
symboling 0
CarName 0
fueltype 0
aspiration 0
doornumber 0
carbody 0
drivewheel 0
enginelocation 0
wheelbase 0
carlength 0
carwidth 0
29
carheight 0
curbweight 0
enginetype 0
cylindernumber 0
enginesize 0
fuelsystem 0
boreratio 0
stroke 0
compressionratio 0
horsepower 0
peakrpm 0
citympg 0
highwaympg 0
price 0
dtype: int64
Removing Outliers
[ ]: #Check Outliers
num_cols = df.select_dtypes(include=['int64', 'float64'])
plt.figure(figsize=(20,10))
#num_cols.boxplot()
sns.boxplot(data=num_cols)
plt.show()
def remove_outlier(col):
    """Return the IQR-based outlier bounds for a numeric column.

    Despite its name, this function does not remove anything: it only
    computes the lower and upper fences, which the caller then uses to
    cap out-of-range values.

    Parameters
    ----------
    col : pandas.Series
        Numeric column to analyse.

    Returns
    -------
    tuple
        ``(lower_range, upper_range)`` where
        ``lower_range = Q1 - 1.5 * IQR`` and
        ``upper_range = Q3 + 1.5 * IQR``.
    """
    # The original called ``sorted(col)`` and discarded the result;
    # ``quantile`` does not need sorted input, so that no-op is dropped.
    Q1, Q3 = col.quantile([0.25, 0.75])
    IQR = Q3 - Q1
    lower_range = Q1 - (1.5 * IQR)
    upper_range = Q3 + (1.5 * IQR)
    return lower_range, upper_range
# Cap (rather than drop) values outside each numeric column's IQR bounds:
# anything above the upper bound becomes the upper bound, anything below
# the lower bound becomes the lower bound.
for i in num_cols.columns:
    lower_range, upper_range = remove_outlier(df[i])
    capped = np.where(df[i] > upper_range, upper_range, df[i])
    df[i] = np.where(capped < lower_range, lower_range, capped)
[ ]: #Check Outliers
num_cols = df.select_dtypes(include=['int64', 'float64'])
plt.figure(figsize=(25,10))
#num_cols.boxplot()
sns.boxplot(data=num_cols)
plt.show()
[ ]: plt.boxplot(df["compressionratio"])
plt.show()
31
Bivariate Analysis
[ ]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 car_ID 205 non-null float64
1 symboling 205 non-null float64
2 CarName 205 non-null object
3 fueltype 205 non-null object
4 aspiration 205 non-null object
5 doornumber 205 non-null object
6 carbody 205 non-null object
7 drivewheel 205 non-null object
8 enginelocation 205 non-null object
9 wheelbase 205 non-null float64
10 carlength 205 non-null float64
11 carwidth 205 non-null float64
12 carheight 205 non-null float64
13 curbweight 205 non-null float64
14 enginetype 205 non-null object
15 cylindernumber 205 non-null object
16 enginesize 205 non-null float64
32
17 fuelsystem 205 non-null object
18 boreratio 205 non-null float64
19 stroke 205 non-null float64
20 compressionratio 205 non-null float64
21 horsepower 205 non-null float64
22 peakrpm 205 non-null float64
23 citympg 205 non-null float64
24 highwaympg 205 non-null float64
25 price 205 non-null float64
dtypes: float64(16), object(10)
memory usage: 41.8+ KB
[ ]: plt.figure(figsize=(8, 6))
sns.barplot(x='fueltype', y='price', data=df)
plt.title('price by fueltype')
plt.xlabel('fueltype')
plt.ylabel('Price')
plt.show()
33
[ ]: plt.figure(figsize=(8, 6))
sns.barplot(x='aspiration', y='price', data=df)
plt.title('price by aspiration')
plt.xlabel('aspiration')
plt.ylabel('Price')
plt.show()
[ ]: plt.figure(figsize=(8, 6))
sns.barplot(x='doornumber', y='price', data=df)
plt.title('price by doornumber')
plt.xlabel('doornumber')
plt.ylabel('Price')
plt.show()
34
[ ]: plt.figure(figsize=(8, 6))
sns.barplot(x='carbody', y='price', data=df)
plt.title('price by carbody')
plt.xlabel('carbody')
plt.ylabel('Price')
plt.show()
35
[ ]: plt.figure(figsize=(8, 6))
sns.barplot(x='drivewheel', y='price', data=df)
plt.title('price by drivewheel')
plt.xlabel('drivewheel')
plt.ylabel('Price')
plt.show()
36
[ ]: plt.figure(figsize=(8, 6))
sns.barplot(x='enginelocation', y='price', data=df)
plt.title('price by enginelocation')
plt.xlabel('enginelocation')
plt.ylabel('Price')
plt.show()
37
[ ]: plt.figure(figsize=(8, 6))
sns.barplot(x='enginetype', y='price', data=df)
plt.title('price by enginetype')
plt.xlabel('enginetype')
plt.ylabel('Price')
plt.show()
38
[ ]: plt.figure(figsize=(8, 6))
sns.barplot(x='cylindernumber', y='price', data=df)
plt.title('price by cylindernumber')
plt.xlabel('cylindernumber')
plt.ylabel('Price')
plt.show()
39
[ ]: plt.figure(figsize=(8, 6))
sns.barplot(x='fuelsystem', y='price', data=df)
plt.title('price by fuelsystem')
plt.xlabel('fuelsystem')
plt.ylabel('Price')
plt.show()
40
[ ]:
[ ]: plt.figure(figsize=(10,6))
sns.regplot(x="wheelbase", y="price", data=df)
plt.show()
41
[ ]: plt.figure(figsize=(10,6))
sns.regplot(x="carlength", y="price", data=df)
plt.show()
42
[ ]: plt.figure(figsize=(10,6))
sns.regplot(x="carheight", y="price", data=df)
plt.show()
[ ]: plt.figure(figsize=(10,6))
sns.regplot(x="curbweight", y="price", data=df)
plt.show()
43
[ ]: plt.figure(figsize=(10,6))
sns.regplot(x="boreratio", y="price", data=df)
plt.show()
44
[ ]: plt.figure(figsize=(10,6))
sns.regplot(x="stroke", y="price", data=df)
plt.show()
[ ]: plt.figure(figsize=(10,6))
sns.regplot(x="compressionratio", y="price", data=df)
plt.show()
45
[ ]: plt.figure(figsize=(10,6))
sns.regplot(x="horsepower", y="price", data=df)
plt.show()
46
num_cols = df.select_dtypes(include=["int64", "float64"])


def plots(num_cols, variable):
    """Show a density histogram with KDE and a normal probability plot.

    Parameters
    ----------
    num_cols : pandas.DataFrame
        Frame holding the numeric columns.
    variable : str
        Name of the column in ``num_cols`` to visualise.
    """
    plt.figure(figsize=(15, 6))

    # Left panel: density histogram with a KDE overlay. ``histplot`` with
    # ``stat="density"`` replaces ``distplot``, which is deprecated and
    # scheduled for removal in seaborn v0.14.0.
    plt.subplot(1, 2, 1)
    sns.histplot(num_cols[variable], kde=True, stat="density", bins=10)
    plt.title(variable)

    # Right panel: probability plot against the normal distribution.
    plt.subplot(1, 2, 2)
    stats.probplot(num_cols[variable], dist="norm", plot=pylab)
    plt.title(variable)

    plt.show()


for i in num_cols.columns:
    plots(num_cols, i)
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
47
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
48
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
49
`distplot` is a deprecated function and will be removed in seaborn v0.14.0.
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
50
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
51
`distplot` is a deprecated function and will be removed in seaborn v0.14.0.
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
52
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
53
`distplot` is a deprecated function and will be removed in seaborn v0.14.0.
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
54
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
55
`distplot` is a deprecated function and will be removed in seaborn v0.14.0.
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
56
<ipython-input-294-7af58d2ef5aa>:6: UserWarning:
Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).
For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
57
<ipython-input-295-50c0f90b2df7>:2: FutureWarning: The default value of
numeric_only in DataFrame.corr is deprecated. In a future version, it will
default to False. Select only valid columns or specify the value of numeric_only
to silence this warning.
sns.heatmap(df.corr(), annot=True, cmap='coolwarm', ax=ax)
[ ]: <Axes: >
for i in Label:
df1[i] = le.fit_transform(df1[i])
df1.shape
[ ]: (205, 26)
[ ]: df1.head()
58
4 5.0 2.0 5 1 0 0 3
highwaympg price
0 27.0 13495.0
1 27.0 16500.0
2 26.0 16500.0
3 30.0 13950.0
4 22.0 17450.0
[5 rows x 26 columns]
6 Feature Engineering
Mutual Information
[ ]: X = df1.iloc[:,2:25]
Y = df1.iloc[:,-1]
[ ]: X.head()
59
enginesize fuelsystem boreratio stroke compressionratio horsepower \
0 130.0 5 3.47 2.68 9.0 111.0
1 130.0 5 3.47 2.68 9.0 111.0
2 152.0 5 2.68 3.47 9.0 154.0
3 109.0 5 3.19 3.40 10.0 102.0
4 136.0 5 3.19 3.40 8.0 115.0
[5 rows x 23 columns]
# Mutual information between every feature in X and the target Y.
# random_state is fixed because the estimator adds small random noise to
# continuous variables, so unseeded runs return different scores.
mi_score1 = pd.Series(
    mutual_info_regression(X, Y, random_state=0),
    index=X.columns,
)
mi_score1.sort_values(ascending=True)
[ ]: enginelocation 0.000000
doornumber 0.003030
fueltype 0.053397
aspiration 0.080864
carbody 0.091125
enginetype 0.170395
peakrpm 0.241351
compressionratio 0.283008
carheight 0.317828
drivewheel 0.323140
cylindernumber 0.324994
CarName 0.342597
stroke 0.343310
boreratio 0.451031
fuelsystem 0.472714
carlength 0.555744
wheelbase 0.583592
carwidth 0.663454
citympg 0.739795
enginesize 0.822806
highwaympg 0.860714
curbweight 0.865915
horsepower 0.877749
dtype: float64
60
[ ]: mi_score1.sort_values(ascending=False).plot.bar(figsize=(20, 8))
[ ]: <Axes: >
61
peakrpm citympg highwaympg
0 5000.0 21.0 27.0
1 5000.0 21.0 27.0
2 5000.0 19.0 26.0
3 5500.0 24.0 30.0
4 5500.0 18.0 22.0
[5 rows x 23 columns]
[ ]: X.head()
[5 rows x 26 columns]
62
# Recompute the mutual-information scores for the current feature set X.
# random_state is fixed because the estimator adds small random noise to
# continuous variables, so unseeded runs return different scores.
mi_score1 = pd.Series(
    mutual_info_regression(X, Y, random_state=0),
    index=X.columns,
)
mi_score1.sort_values(ascending=True)
[ ]: doornumber 0.025253
enginelocation 0.041341
fueltype 0.045562
carbody 0.062457
aspiration 0.071195
enginetype 0.167651
peakrpm 0.253169
compressionratio 0.288358
carheight 0.300771
drivewheel 0.320837
cylindernumber 0.329567
CarName 0.342232
stroke 0.347834
fuelsystem 0.431367
boreratio 0.456181
carlength 0.542398
wheelbase 0.574436
carwidth 0.658359
citympg 0.740055
engine_displacement 0.771470
enginesize 0.806962
fuel_efficiency 0.810593
highwaympg 0.833873
curbweight 0.866269
horsepower 0.891800
quality_index 0.957998
dtype: float64
[ ]: mi_score1.sort_values(ascending=False).plot.bar(figsize=(20, 8))
[ ]: <Axes: >
63
Splitting Data into Train And Test
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
train_data, test_data, train_label, test_label = train_test_split(
    X, Y, test_size=0.3, random_state=0
)
[ ]: print("train_data : ",train_data.shape)
print("train_label : ",train_label.shape)
print("test_data : ",test_data.shape)
print("test_label : ",test_label.shape)
[ ]: train_data_sc
64
…,
[ 0.2644312 , 0.36731544, 1.98268366, …, -0.01734802,
-1.09944943, 1.0278145 ],
[-0.69818812, 0.36731544, -0.5043669 , …, 2.54320994,
-1.75283651, 2.81585328],
[ 1.15300288, 0.36731544, -0.5043669 , …, 0.1567409 ,
-0.11936879, 0.73809393]])
PCA
# Fit PCA on the training set only, then project the test set onto the
# same components. Calling fit_transform on the test set would learn a
# second, unrelated set of components, so train and test features would
# no longer be comparable.
pc = PCA()
train_data_sc_pc = pc.fit_transform(train_data_sc)
test_data_sc_pc = pc.transform(test_data_sc)
explained_variance = pc.explained_variance_ratio_

# Draw the cumulative curve itself; without a plot call the figure only
# shows labelled, empty axes.
component_counts = np.arange(1, len(explained_variance) + 1)
plt.plot(component_counts, np.cumsum(explained_variance), marker="o")
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Scree Plot or Cumulative Explained Variance Plot')
plt.grid(True)
plt.show()
65
[ ]: # Compute the cumulative explained variance
cumulative_variance = np.cumsum(explained_variance)
# Keep the first 12 principal components. The projection is learned from
# the training set only and then applied unchanged to the test set;
# re-fitting on the test set would produce different components.
pc = PCA(n_components=12)
train_data_sc_pc = pc.fit_transform(train_data_sc)
test_data_sc_pc = pc.transform(test_data_sc)
[ ]: explained_variance = pc.explained_variance_ratio_
print("Explained Variance Ratios:", explained_variance)
66
0.04482467 0.03313997 0.02969732 0.02431859 0.01736173 0.01370672]
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.title('Scree Plot or Cumulative Explained Variance Plot')
plt.grid(True)
plt.show()
7 Model
Linear Regression
# A single fit is enough; the chained second .fit() call repeated the
# same training on the same data.
model_lr = LinearRegression().fit(train_data_sc_pc, train_label)
67
[ ]: y_pred = model_lr.predict(test_data_sc_pc)
y_pred
[ ]: plt.figure(figsize=(10,6))
plt.scatter(test_label,y_pred)
plt.plot(test_label,test_label,'r')
[ ]: [<matplotlib.lines.Line2D at 0x7f67f2210370>]
68
Ridge Model
[ ]: model_ridge = Ridge(alpha= 1.0).fit(train_data_sc_pc,train_label)
[ ]: y_pred_4 = model_ridge.predict(test_data_sc_pc)
[ ]: plt.figure(figsize=(10,6))
plt.scatter(test_label,y_pred_4)
plt.plot(test_label,test_label,'r')
69
[ ]: [<matplotlib.lines.Line2D at 0x7f67e5340130>]
[ ]: model_rf.score(train_data_sc_pc,train_label)
[ ]: 0.9849200112110187
[ ]: y_pred_2 = model_rf.predict(test_data_sc_pc)
70
Train Data Cross_val_score : 0.8665114703003836
Test Data Cross_val_score : 0.7398572840313276
[ ]: plt.figure(figsize=(10,6))
plt.scatter(test_label,y_pred_2)
plt.plot(test_label,test_label,'r')
[ ]: [<matplotlib.lines.Line2D at 0x7f67f1f48a90>]
[ ]: y_pred_3 = model_tree.predict(test_data_sc_pc)
71
print("Test Data Cross_val_score :␣
↪",cross_val_score(model_tree,test_data_sc_pc,test_label,cv=5).mean())
# Predicted vs. actual values for the decision tree. y_pred_3 holds the
# model_tree predictions; y_pred_2 (random forest) was plotted here by
# mistake. The red line marks a perfect prediction.
plt.figure(figsize=(10, 6))
plt.scatter(test_label, y_pred_3)
plt.plot(test_label, test_label, 'r')
[ ]: [<matplotlib.lines.Line2D at 0x7f67ea0af2e0>]
72