Project Linear Regression
Project Linear Regression
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
# Read the housing data from 'USA_Housing.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv('USA_Housing.csv')
In [3]:
df.head()
Out[3]:
Avg. Area Avg. Area Avg. Area Number Avg. Area Number Area
Price Address
Income House Age of Rooms of Bedrooms Population
9127 Elizabeth
2 61287.067179 5.865890 8.512727 5.13 36882.159400 1.058988e+06 Stravenue\nDanieltown,
WI 06482...
USS Barnett\nFPO AP
3 63345.240046 7.188236 5.586729 3.26 34310.242831 1.260617e+06
44820
USNS Raymond\nFPO
4 59982.197226 5.040555 7.839388 4.23 26354.109472 6.309435e+05
AE 09386
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Avg. Area Income 5000 non-null float64
1 Avg. Area House Age 5000 non-null float64
2 Avg. Area Number of Rooms 5000 non-null float64
3 Avg. Area Number of Bedrooms 5000 non-null float64
4 Area Population 5000 non-null float64
5 Price 5000 non-null float64
6 Address 5000 non-null object
dtypes: float64(6), object(1)
memory usage: 273.6+ KB
In [5]:
df.describe()
Out[5]:
Avg. Area Avg. Area House Avg. Area Number of Avg. Area Number of Area
Price
Income Age Rooms Bedrooms Population
In [6]:
df.columns
Out[6]:
Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Address'],
dtype='object')
In [7]:
# EDA (exploratory data analysis)
In [8]:
# Draw a grid of pairwise plots for the columns of df to eyeball
# relationships between the features and the price.
sns.pairplot(df)
Out[8]:
<seaborn.axisgrid.PairGrid at 0x7f10a32c6450>
In [9]:
# Distribution of the target: density-normalised histogram with a KDE overlay.
# `sns.distplot` is deprecated (seaborn >= 0.11); `histplot` with `kde=True`
# and `stat='density'` is the documented replacement for the same picture.
sns.histplot(df['Price'], kde=True, stat='density')
In [10]:
# Correlation heatmap of the numeric columns only. `Address` is an object
# column, and `DataFrame.corr()` raises on non-numeric data in pandas >= 2.0,
# so restrict to numeric dtypes explicitly (older pandas dropped such columns
# silently, so the result there is unchanged).
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f109a4dba90>
In [11]:
# Predictors: the five numeric area-level columns. `Address` is free text and
# `Price` is the target, so both are left out of the feature matrix.
feature_cols = [
    'Avg. Area Income',
    'Avg. Area House Age',
    'Avg. Area Number of Rooms',
    'Avg. Area Number of Bedrooms',
    'Area Population',
]
X = df[feature_cols]

# Target: the house price.
y = df['Price']
In [11]:
In [12]:
from sklearn.model_selection import train_test_split
In [13]:
# Hold out 40% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=101,
)
In [14]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.95, random_state=101)
In [15]:
#X_test
In [16]:
from sklearn.linear_model import LinearRegression
In [17]:
# Create an unfitted linear-regression estimator with default parameters.
lm = LinearRegression()
In [18]:
# Fit the model on the training split only; the test split stays unseen.
lm.fit(X_train, y_train)
Out[18]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [19]:
# Model evaluation
In [20]:
# Predict prices for the held-out test features; compared against y_test below.
predictions = lm.predict(X_test)
In [21]:
predictions
Out[21]:
In [22]:
y_test
Out[22]:
1718 1.251689e+06
2511 8.730483e+05
345 1.696978e+06
2521 1.063964e+06
54 9.487883e+05
...
1776 1.489520e+06
4269 7.777336e+05
1661 1.515271e+05
2410 1.343824e+06
2302 1.906025e+06
Name: Price, Length: 2000, dtype: float64
In [23]:
# Predicted vs. actual prices on the test split. Axis labels and a title are
# set so the figure can be read on its own when the notebook is skimmed.
plt.scatter(y_test, predictions)
plt.xlabel('Actual price (y_test)')
plt.ylabel('Predicted price')
plt.title('Predicted vs. actual house price')
plt.show()
Out[23]:
<matplotlib.collections.PathCollection at 0x7f1091e384d0>
In [24]:
# Distribution of the residuals (actual - predicted) on the test split.
# `sns.distplot` is deprecated (seaborn >= 0.11); `histplot` with `kde=True`
# and `stat='density'` is the documented replacement for the same picture.
sns.histplot(y_test - predictions, bins=50, kde=True, stat='density')
In [25]:
lm.intercept_
Out[25]:
-2640159.796853739
In [26]:
# One row per feature, holding the fitted coefficient for that feature.
coef_df = pd.DataFrame(
    data=lm.coef_,
    index=X.columns,
    columns=['Coeff'],
)
In [27]:
coef_df
Out[27]:
Coeff
In [28]:
from sklearn import metrics
In [29]:
# Mean absolute error (MAE) of the test-set predictions.
metrics.mean_absolute_error(y_test, predictions)
Out[29]:
82288.22251914928
In [30]:
# Mean squared error (MSE) of the test-set predictions.
metrics.mean_squared_error(y_test, predictions)
Out[30]:
10460958907.208244
In [31]:
# Root mean squared error (RMSE): square root of the MSE, which puts the
# error back in the same units as Price.
np.sqrt(metrics.mean_squared_error(y_test, predictions))
Out[31]:
102278.82922290538
In [32]:
df.columns
Out[32]:
Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
'Avg. Area Number of Bedrooms', 'Area Population', 'Price', 'Address'],
dtype='object')
In [33]:
# A single hand-built observation, with the same column names and order as
# the training features, used to try out the fitted model.
single_data = pd.DataFrame({
    'Avg. Area Income': [65000],
    'Avg. Area House Age': [10],
    'Avg. Area Number of Rooms': [7],
    'Avg. Area Number of Bedrooms': [4],
    'Area Population': [100000],
})
In [34]:
single_data
Out[34]:
Avg. Area Income Avg. Area House Age Avg. Area Number of Rooms Avg. Area Number of Bedrooms Area Population
0 65000 10 7 4 100000
In [35]:
# Predict the price for the single hand-built row; the result is a
# one-element array.
# NOTE(review): 'Area Population' here (100000) is well above the values
# visible in the df.head() output, so this prediction is probably an
# extrapolation beyond the training data — confirm against df.describe().
lm.predict(single_data)
Out[35]:
array([2788568.88485117])
In [35]: