Real Estate Price Prediction Model
Real Estate Price Prediction Model
import matplotlib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
matplotlib.rcParams["figure.figsize"] = (20,10)
df1 = pd.read_csv(r"C:\Users\hp\Downloads\Bengaluru_House_Data.csv")
df1.head()
Data Cleaning:
df1.groupby('area_type')['area_type'].agg('count')
area_type
Built-up Area 2418
Carpet Area 87
Plot Area 2025
Super built-up Area 8790
Name: area_type, dtype: int64
# Continue with a frame that no longer has the area_type, society, balcony
# and availability columns; df1 itself is left untouched.
df2 = df1.drop(columns=['area_type', 'society', 'balcony', 'availability'])
df2.head()
location 1
size 16
total_sqft 0
bath 73
price 0
dtype: int64
df3 = df2.dropna()
df3.isnull().sum()
location 0
size 0
total_sqft 0
bath 0
price 0
dtype: int64
df3.shape
(13246, 5)
C:\Users\hp\AppData\Local\Temp\ipykernel_10596\945158270.py:1:
SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
df3['bhk'].unique()
df3[df3.bhk>20]
location size total_sqft bath price
bhk
1718 2Electronic City Phase II 27 BHK 8000 27.0 230.0
27
4684 Munnekollal 43 Bedroom 2400 40.0 660.0
43
df3.total_sqft.unique()
def is_float(x):
    """Return True if ``float(x)`` succeeds, otherwise False.

    Args:
        x: Any value; typically a raw ``total_sqft`` entry.

    Returns:
        bool: True when the value converts to a float, False when the
        conversion fails.
    """
    try:
        float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a float"; anything else
        # (e.g. KeyboardInterrupt) must propagate rather than be hidden.
        return False
    return True
df3[~df3['total_sqft'].apply(is_float)].head(10)
def sqft_to_num(x):
    """Convert a raw ``total_sqft`` entry to a float.

    A value of the form ``"low-high"`` is converted to the midpoint of the
    two numbers. Any other value is passed to ``float``.

    Args:
        x: Raw entry, usually a string such as ``"1056"`` or
            ``"2100 - 2850"``. Numeric input is accepted as well.

    Returns:
        float | None: The parsed number, or None when the value cannot be
        interpreted as a number or a numeric range.
    """
    if isinstance(x, str):
        tokens = x.split('-')
        if len(tokens) == 2:
            try:
                return (float(tokens[0]) + float(tokens[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. "-5" or "1e-3"); fall through
                # and try the value as a single number.
                pass
    try:
        return float(x)
    except (TypeError, ValueError):
        return None
sqft_to_num('1230-2342')
1786.0
df4 = df3.copy()
df4['total_sqft'] = df4['total_sqft'].apply(sqft_to_num)
df4.head()
location size total_sqft bath price bhk
0 Electronic City Phase II 2 BHK 1056.0 2.0 39.07 2
1 Chikka Tirupathi 4 Bedroom 2600.0 5.0 120.00 4
2 Uttarahalli 3 BHK 1440.0 2.0 62.00 3
3 Lingadheeranahalli 3 BHK 1521.0 3.0 95.00 3
4 Kothanur 2 BHK 1200.0 2.0 51.00 2
df4.loc[30]
location Yelahanka
size 4 BHK
total_sqft 2475.0
bath 4.0
price 186.0
bhk 4
Name: 30, dtype: object
df4.head(3)
# Work on a copy so that df4 is not modified by the new column.
df5 = df4.copy()
# price_per_sqft = price * 100000 / total_sqft.
# NOTE(review): the 100000 factor presumably converts price from lakhs to
# rupees - confirm against the dataset description.
df5['price_per_sqft'] = df5['price']*100000/df5['total_sqft']
df5.head()
price_per_sqft
0 3699.810606
1 4615.384615
2 4305.555556
3 6245.890861
4 4250.000000
len(df5.location.unique())
1304
# Strip leading/trailing whitespace so the same location is not counted
# under two different spellings.
df5.location = df5.location.str.strip()
# Number of listings per location, most frequent first. The chain is wrapped
# in parentheses so it stays a single statement when split across lines.
location_stats = (
    df5.groupby('location')['location']
    .agg('count')
    .sort_values(ascending=False)
)
location_stats
location
Whitefield 535
Sarjapur Road 392
Electronic City 304
Kanakpura Road 266
Thanisandra 236
...
1 Giri Nagar 1
Kanakapura Road, 1
Kanakapura main Road 1
Karnataka Shabarimala 1
whitefiled 1
Name: location, Length: 1293, dtype: int64
len(location_stats[location_stats<=10])
1052
location_stats_less_than_10 = location_stats[location_stats<=10]
location_stats_less_than_10
location
Basapura 10
1st Block Koramangala 10
Gunjur Palya 10
Kalkere 10
Sector 1 HSR Layout 10
..
1 Giri Nagar 1
Kanakapura Road, 1
Kanakapura main Road 1
Karnataka Shabarimala 1
whitefiled 1
Name: location, Length: 1052, dtype: int64
len(df5.location.unique())
1293
242
df5.head(10)
price_per_sqft
0 3699.810606
1 4615.384615
2 4305.555556
3 6245.890861
4 4250.000000
5 3247.863248
6 7467.057101
7 18181.818182
8 4828.244275
9 36274.509804
Outlier Removal:
df5[df5.total_sqft / df5.bhk<300].head()
price_per_sqft
9 36274.509804
45 33333.333333
58 10660.980810
68 6296.296296
70 20000.000000
df5.shape
(13246, 7)
(12502, 7)
df6.price_per_sqft.describe()
count 12456.000000
mean 6308.502826
std 4168.127339
min 267.829813
25% 4210.526316
50% 5294.117647
75% 6916.666667
max 176470.588235
Name: price_per_sqft, dtype: float64
def remove_pps_outliers(df):
    """Drop rows whose price_per_sqft is far from their location's mean.

    For every ``location`` group, only rows satisfying
    ``mean - std < price_per_sqft <= mean + std`` are kept, where mean and
    std are computed inside that group with ``np.mean`` and ``np.std``.

    Args:
        df: DataFrame with ``location`` and ``price_per_sqft`` columns.

    Returns:
        A new DataFrame holding the kept rows, ordered group by group, with a
        fresh 0..n-1 index. If nothing is kept, an empty frame with the same
        columns as ``df`` is returned.
    """
    kept = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        # The lower bound is strict, so a group with zero spread (for
        # example a single listing) keeps no rows.
        within = (subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))
        reduced_df = subdf[within]
        if not reduced_df.empty:
            kept.append(reduced_df)
    if not kept:
        return df.iloc[0:0].copy()
    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(kept, ignore_index=True)
df7 = remove_pps_outliers(df6)
df7.shape
(10241, 7)
def plot_scatter_chart(df,location):
    """Scatter-plot price against total_sqft for one location.

    Rows of ``df`` at ``location`` with bhk == 2 are drawn as blue dots and
    rows with bhk == 3 as green '+' markers. The global matplotlib figure
    size is set to (15, 10) as a side effect.

    Args:
        df: DataFrame with ``location``, ``bhk``, ``total_sqft`` and
            ``price`` columns.
        location: Location value to select; also used as the plot title.
    """
    at_location = df.location == location
    two_bhk_rows = df[at_location & (df.bhk == 2)]
    three_bhk_rows = df[at_location & (df.bhk == 3)]

    matplotlib.rcParams['figure.figsize'] = (15, 10)
    plt.scatter(
        two_bhk_rows.total_sqft,
        two_bhk_rows.price,
        color='blue',
        label='2 BHK',
        s=50,
    )
    plt.scatter(
        three_bhk_rows.total_sqft,
        three_bhk_rows.price,
        marker='+',
        color='green',
        label='3 BHK',
        s=50,
    )
    plt.title(location)
    plt.xlabel("Total Square Feet Area")
    plt.ylabel("Price")
    plt.legend()
plot_scatter_chart(df7,"Hebbal")
# We should also remove properties where, for the same location, the price
# of (for example) a 3 bedroom apartment is less than that of a
# 2 bedroom apartment (with the same sqft area). For a given location,
# we build a dictionary of stats per bhk, i.e. the mean, std and count of
# price_per_sqft for each bhk value.
def remove_bhk_outliers(df):
    """Drop rows priced below the mean of the next-smaller bhk in a location.

    Within each ``location``, a row with ``bhk == k`` is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``bhk == k - 1`` rows of that location, provided that smaller group has
    more than 5 rows.

    Args:
        df: DataFrame with ``location``, ``bhk`` and ``price_per_sqft``
            columns. Its index labels are used to drop rows.

    Returns:
        A new DataFrame without the excluded rows; ``df`` is not modified.
    """
    # A plain list keeps index labels in their original type; accumulating
    # them in a float ndarray would coerce integer labels to float.
    exclude_indices = []
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('bhk'))
        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }
        for bhk, bhk_df in bhk_groups:
            smaller = bhk_stats.get(bhk - 1)
            # Only compare against a smaller-bhk group with enough rows.
            if smaller is not None and smaller['count'] > 5:
                below = bhk_df[bhk_df.price_per_sqft < smaller['mean']]
                exclude_indices.extend(below.index)
    return df.drop(exclude_indices, axis='index')
# Drop listings whose price_per_sqft is below the mean of the next-smaller
# bhk group in the same location.
df8 = remove_bhk_outliers(df7)
df8.shape
(7329, 7)
plot_scatter_chart(df8,"Hebbal")
matplotlib.rcParams["figure.figsize"] = (20,10)
plt.hist(df8.price_per_sqft,rwidth=0.8)
plt.xlabel("Price Per Square Feet")
plt.ylabel("Count")
array([ 4., 3., 2., 5., 8., 1., 6., 7., 9., 12., 16., 13.])
df8[df8.bath>10]
plt.hist(df8.bath,rwidth=0.8)
plt.xlabel("Number of Bathrooms")
plt.ylabel("Counts")
# Keep only rows with fewer than bhk + 2 bathrooms; rows with more
# bathrooms than that are discarded.
df9 = df8[df8.bath<df8.bhk+2]
df9.shape
(7251, 7)
dummies = pd.get_dummies(df10.location).astype(int)
dummies
2nd Stage Nagarbhavi 5th Block Hbr Layout 5th Phase JP Nagar
\
0 0 0 0
1 0 0 0
2 0 0 0
3 0 0 0
4 0 0 0
10232 0 0 0
10233 0 0 0
10236 0 0 0
10237 0 0 0
10240 0 0 0
1st Phase JP Nagar 2nd Phase Judicial Layout 2nd Stage Nagarbhavi
\
0 0 0 0
1 0 0 0
2 0 0 0
df12 = df11.drop(['location'],axis='columns')
df12.head(2)
2nd Phase Judicial Layout 2nd Stage Nagarbhavi 5th Block Hbr
Layout \
0 0 0
0
1 0 0
0
2nd Phase Judicial Layout 2nd Stage Nagarbhavi 5th Block Hbr
Layout \
0 0 0
0
1 0 0
0
2 0 0
0
3 0 0
0
4 0 0
0
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 0 0 0 0
0 428.0
1 194.0
2 235.0
3 130.0
4 148.0
Name: price, dtype: float64
0.8452277697874324
scores = []
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\
_validation.py:378: FitFailedWarning:
10 fits failed out of a total of 20.
The score on these train-test partitions for these parameters will be
set to nan.
If these failures are not expected, you can try to debug them by
setting error_score='raise'.
warnings.warn(some_fits_failed_message, FitFailedWarning)
C:\ProgramData\anaconda3\Lib\site-packages\sklearn\model_selection\
_search.py:952: UserWarning: One or more of the test scores are non-
finite: [ nan nan 0.70935255 0.68931782]
warnings.warn(
model best_score \
0 linear_regression 0.819001
1 lasso 0.687436
2 decision_tree 0.709353
best_params
0 {'fit_intercept': False}
1 {'alpha': 2, 'selection': 'random'}
2 {'criterion': 'friedman_mse', 'splitter': 'best'}
def predict_price(location,sqft,bath,bhk):
    """Predict a price for one property with the fitted ``lr_clf`` model.

    Builds a single feature row in the column order of ``X``: positions 0, 1
    and 2 receive ``sqft``, ``bath`` and ``bhk``, and the column whose name
    equals ``location`` is set to 1.

    Args:
        location: Name of the location column to switch on.
        sqft: Value written to feature position 0.
        bath: Value written to feature position 1.
        bhk: Value written to feature position 2.

    Returns:
        The first element of ``lr_clf.predict`` for the constructed row.
    """
    x = np.zeros(len(X.columns))
    x[0] = sqft
    x[1] = bath
    x[2] = bhk

    # A location without a matching column leaves every location flag at 0
    # instead of raising IndexError.
    # NOTE(review): presumably such a row then represents the baseline
    # ("other") location of the encoding - confirm against how X was built.
    matches = np.where(X.columns == location)[0]
    if matches.size and matches[0] > 0:
        x[matches[0]] = 1

    # Predict on a frame carrying X's column names so scikit-learn does not
    # warn that the input lacks the feature names the model was fitted with.
    features = pd.DataFrame([x], columns=X.columns)
    return lr_clf.predict(features)[0]
X.columns
C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py:439:
UserWarning: X does not have valid feature names, but LinearRegression
was fitted with feature names
warnings.warn(
83.49904677185246
predict_price('Indira Nagar',1000,2,2)
C:\ProgramData\anaconda3\Lib\site-packages\sklearn\base.py:439:
UserWarning: X does not have valid feature names, but LinearRegression
was fitted with feature names
warnings.warn(
181.2781548400676
import json
import pickle

# Serialize the fitted model to a pickle file.
with open('bangalore_home_prices_model.pickle', 'wb') as f:
    pickle.dump(lr_clf, f)

# Write the lower-cased feature column names of X, in order, to a JSON file.
columns = {'data_columns': [name.lower() for name in X.columns]}
with open("columns.json", "w") as f:
    json.dump(columns, f)