DSBDA4
DSBDA4
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [27]: df.head()
Out[27]: CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 NaN 36.2
In [28]: df.tail()
Out[28]: CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
501 0.06263 0.0 11.93 0.0 0.573 6.593 69.1 2.4786 1 273 21.0 391.99 NaN 22.4
502 0.04527 0.0 11.93 0.0 0.573 6.120 76.7 2.2875 1 273 21.0 396.90 9.08 20.6
503 0.06076 0.0 11.93 0.0 0.573 6.976 91.0 2.1675 1 273 21.0 396.90 5.64 23.9
504 0.10959 0.0 11.93 0.0 0.573 6.794 89.3 2.3889 1 273 21.0 393.45 6.48 22.0
505 0.04741 0.0 11.93 0.0 0.573 6.030 NaN 2.5050 1 273 21.0 396.90 7.88 11.9
In [29]: df.describe()
Out[29]: CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO
count 486.000000 486.000000 486.000000 486.000000 506.000000 506.000000 486.000000 506.000000 506.000000 506.000000 506.000000
mean 3.611874 11.211934 11.083992 0.069959 0.554695 6.284634 68.518519 3.795043 9.549407 408.237154 18.455534
std 8.720192 23.388876 6.835896 0.255340 0.115878 0.702617 27.999513 2.105710 8.707259 168.537116 2.164946
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000
25% 0.081900 0.000000 5.190000 0.000000 0.449000 5.885500 45.175000 2.100175 4.000000 279.000000 17.400000
50% 0.253715 0.000000 9.690000 0.000000 0.538000 6.208500 76.800000 3.207450 5.000000 330.000000 19.050000
75% 3.560263 12.500000 18.100000 0.000000 0.624000 6.623500 93.975000 5.188425 24.000000 666.000000 20.200000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000
In [30]: df.shape
In [31]: df.dtypes
In [32]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 CRIM 486 non-null float64
1 ZN 486 non-null float64
2 INDUS 486 non-null float64
3 CHAS 486 non-null float64
4 NOX 506 non-null float64
5 RM 506 non-null float64
6 AGE 486 non-null float64
7 DIS 506 non-null float64
8 RAD 506 non-null int64
9 TAX 506 non-null int64
10 PTRATIO 506 non-null float64
11 B 506 non-null float64
12 LSTAT 486 non-null float64
13 MEDV 506 non-null float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB
In [37]: df.isna().sum()
Out[37]: CRIM 0
ZN 0
INDUS 0
CHAS 0
NOX 0
RM 0
AGE 0
DIS 0
RAD 0
TAX 0
PTRATIO 0
B 0
LSTAT 0
MEDV 0
dtype: int64
CRIM 0
ZN 0
INDUS 0
CHAS 0
NOX 0
RM 0
AGE 0
DIS 0
RAD 0
TAX 0
PTRATIO 0
B 0
LSTAT 0
MEDV 0
dtype: int64
In [39]: x.head()
Out[39]: CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.980000
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.140000
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.030000
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.940000
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 12.715432
In [40]: y.head()
Out[40]: 0 24.0
1 21.6
2 34.7
3 33.4
4 36.2
Name: MEDV, dtype: float64
Out[48]: ▾ LinearRegression i ?
LinearRegression()
In [49]: train_score=round(regression.score(x_train,y_train)*100,2)
print('Train score of Linear Regression:',train_score)
In [52]: predictions
Out[55]: 57.03
In [58]: df.head(15)
Out[58]: CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0.000000 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.980000 24.0
1 0.02731 0.0 7.07 0.000000 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.140000 21.6
2 0.02729 0.0 7.07 0.000000 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.030000 34.7
3 0.03237 0.0 2.18 0.000000 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.940000 33.4
4 0.06905 0.0 2.18 0.000000 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 12.715432 36.2
5 0.02985 0.0 2.18 0.000000 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.210000 28.7
6 0.08829 12.5 7.87 0.069959 0.524 6.012 66.6 5.5605 5 311 15.2 395.60 12.430000 22.9
7 0.14455 12.5 7.87 0.000000 0.524 6.172 96.1 5.9505 5 311 15.2 396.90 19.150000 27.1
8 0.21124 12.5 7.87 0.000000 0.524 5.631 100.0 6.0821 5 311 15.2 386.63 29.930000 16.5
9 0.17004 12.5 7.87 0.069959 0.524 6.004 85.9 6.5921 5 311 15.2 386.71 17.100000 18.9
10 0.22489 12.5 7.87 0.000000 0.524 6.377 94.3 6.3467 5 311 15.2 392.52 20.450000 15.0
11 0.11747 12.5 7.87 0.000000 0.524 6.009 82.9 6.2267 5 311 15.2 396.90 13.270000 18.9
12 0.09378 12.5 7.87 0.000000 0.524 5.889 39.0 5.4509 5 311 15.2 390.50 15.710000 21.7
13 0.62976 0.0 8.14 0.000000 0.538 5.949 61.8 4.7075 4 307 21.0 396.90 8.260000 20.4
14 0.63796 0.0 8.14 0.069959 0.538 6.096 84.5 4.4619 4 307 21.0 380.02 10.260000 18.2
In [59]: regression.predict([[0.62976,0.0,8.14,0.0,0.538,5.949,61.8,4.7075,4,307,21.0,396.60,8.26]])
In [61]: regression.intercept_
Out[61]: np.float64(35.040166029487466)
In [62]: regression.coef_
# Draw one bar per entry of lr_coefficient["columns"], with heights taken from
# the 'Coefficient Estimate' values. `ax` and `lr_coefficient` are created
# earlier in this cell / notebook (not visible in this excerpt).
ax.bar(lr_coefficient["columns"],
lr_coefficient['Coefficient Estimate'])
# Anchor the bottom spine at y = 0 so positive and negative bars share a visible baseline.
ax.spines['bottom'].set_position('zero')
# NOTE(review): the style is switched only after the bars have been drawn, so it
# presumably affects figures created later rather than this one — confirm, and
# consider moving this call above the figure creation.
plt.style.use('ggplot')
plt.grid()
plt.show()
In [65]: plt.style.use('ggplot')  # select the style before the figure is created so this plot uses it
fig, ax = plt.subplots(figsize=(20, 10))

# Base palette of matplotlib "tab:" colours. The first nine entries are the ones
# the cell originally listed; the tail of that list was not recoverable, so the
# remaining entries are a reconstruction — TODO confirm the intended colours.
palette = ['tab:gray', 'tab:blue', 'tab:orange', 'tab:green', 'tab:red',
           'tab:purple', 'tab:brown', 'tab:pink', 'tab:gray',
           'tab:olive', 'tab:cyan']

# Build exactly one colour per bar by cycling the palette, so the list length
# always matches the number of plotted entries.
n_bars = len(lr_coefficient["columns"])
color = [palette[i % len(palette)] for i in range(n_bars)]

ax.bar(lr_coefficient["columns"],
       lr_coefficient['Coefficient Estimate'],
       color=color)
# Anchor the bottom spine at y = 0 so positive and negative bars share a visible baseline.
ax.spines['bottom'].set_position('zero')
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_title('Linear Regression coefficient estimates')
ax.set_ylabel('Coefficient Estimate')
plt.show()
In [ ]: