California Housing Dataset
California Housing Dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.datasets import fetch_california_housing
# Apply seaborn's default theme so later matplotlib/seaborn figures share it.
sns.set()
# IPython magic (notebook-only, not plain Python): render figures inline.
%matplotlib inline
A:\Anaconda\lib\site-packages\scipy\__init__.py:155: UserWarning: A
NumPy version >=1.18.5 and <1.25.0 is required for this version of
SciPy (detected version 1.26.3
warnings.warn(f"A NumPy version >={np_minversion} and
<{np_maxversion}"
Loading Data
# Fetch the California housing dataset as pandas objects (as_frame=True).
# The remaining keyword arguments are passed explicitly as in the original
# call.
house = fetch_california_housing(
    data_home=None,
    download_if_missing=True,
    return_X_y=False,
    as_frame=True,
)

# Work on a copy of the feature frame: assigning the target column to
# `house.data` itself would also add it to the features stored on `house`,
# which is an easy source of target leakage if `house.data` is reused later.
df = house.data.copy()
df['MedHouseValue'] = house.target

# Last expression of the cell: display the first rows.
df.head()
Summary Statistics
# Display pandas' descriptive statistics for the columns of `df`.
df.describe()
MedInc VS MedHouseValue
# Interactive scatter of MedInc (x) against MedHouseValue (y) over all of `df`.
px.scatter(df,x='MedInc',y='MedHouseValue')
Housing Value based on Location
# Plot every row at its Longitude/Latitude; marker colour encodes
# MedHouseValue and marker size encodes Population.
px.scatter(df, x="Longitude",y="Latitude",
color='MedHouseValue',size='Population')
import folium
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np
import branca.colormap as cm
# Create a colormap
colormap = plt.cm.viridis

# Scale MedHouseValue into [0, 1] so it can index the colormap.
value_norm = mcolors.Normalize(
    vmin=df['MedHouseValue'].min(),
    vmax=df['MedHouseValue'].max(),
)

# NOTE(review): the original cell used `row`, `color` and `map_california`
# without defining them, and later cells use `chunk`; those definitions are
# missing from the visible source. The reconstruction below assumes `chunk`
# is a reproducible random subset of `df` (kept small so the map stays
# responsive) -- confirm against the original notebook.
chunk = df.sample(n=min(len(df), 2000), random_state=42)

# Base map centred on the mean coordinates of the dataset.
map_california = folium.Map(
    location=[df['Latitude'].mean(), df['Longitude'].mean()],
    zoom_start=6,
)

# One circle per sampled row, filled with the colour for its house value.
for _, row in chunk.iterrows():
    color = mcolors.to_hex(colormap(value_norm(row['MedHouseValue'])))
    folium.CircleMarker(
        [row['Latitude'], row['Longitude']],
        radius=5,  # Size of the marker
        fill=True,
        fill_color=color,
        color=None,
        fill_opacity=0.7,
    ).add_to(map_california)

# Last expression of the cell: display the map.
map_california
MedInc VS MedHouseValue
# Scatter of MedInc (x) against MedHouseValue (y) for `chunk`.
# NOTE(review): `chunk` must already be defined by an earlier cell
# (presumably a subset of `df`) -- TODO confirm.
px.scatter(chunk,x='MedInc',y='MedHouseValue')
AveRooms VS MedHouseValue
# Scatter of AveRooms (x) against MedHouseValue (y) for `chunk`.
# NOTE(review): `chunk` must already be defined by an earlier cell
# (presumably a subset of `df`) -- TODO confirm.
px.scatter(chunk,x='AveRooms',y='MedHouseValue')
Heatmap
# Pairwise correlation matrix of the columns of `df`.
cor = df.corr()
# Draw the matrix as a heatmap, annotating each cell to two decimals.
sns.heatmap(cor, annot=True,fmt='.2f')
Machine Learning
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Feature matrix: every column except the target and three excluded
# predictors. (The original statement was split after `X =`, which is a
# syntax error; it is rejoined here.)
# NOTE(review): the reason for excluding Population/AveOccup/AveBedrms is not
# shown in this file -- presumably based on the correlation heatmap; confirm.
X = df.drop(['MedHouseValue', 'Population', 'AveOccup', 'AveBedrms'], axis=1)

# Double brackets keep the target as a single-column DataFrame, not a Series.
Y = df[['MedHouseValue']]

# Hold out 40% of the rows for evaluation; the fixed seed makes the split
# reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, Y, test_size=0.4, random_state=42
)

# Fit a linear regression on the training split and predict the held-out rows.
model = LinearRegression().fit(xtrain, ytrain)
pred = model.predict(xtest)