Example Project California Data Analysis Jupyter Notebook
Example Project California Data Analysis Jupyter Notebook
Import libraries
In [1]: import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
Read data
In [2]: file_path = '/california-housing-prices/housing.csv'
# Read the housing CSV into the `data` DataFrame that the following cells inspect and plot.
# NOTE(review): this is an absolute path starting at the filesystem root, so the cell only
# runs on machines where that directory exists — consider a configurable data directory.
data = pd.read_csv(file_path)
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 1/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [3]: data.sample(20)
Out[3]: longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_
4000 -118.63 34.18 33.0 5252.0 760.0 2041.0 730.0 6.7977 389
18854 -122.45 41.28 15.0 2740.0 503.0 1188.0 445.0 3.4519 128
2332 -119.69 36.83 7.0 2075.0 353.0 1040.0 362.0 3.9943 100
11959 -117.44 33.90 23.0 4487.0 754.0 2609.0 778.0 4.2788 148
8882 -118.51 34.04 40.0 1382.0 167.0 483.0 178.0 11.7045 500
638 -122.15 37.72 31.0 1616.0 372.0 739.0 379.0 2.9097 210
15852 -122.43 37.74 52.0 2637.0 539.0 1159.0 497.0 3.8846 333
14529 -117.14 32.92 15.0 3242.0 595.0 1936.0 593.0 4.9706 184
16984 -122.30 37.56 35.0 1873.0 351.0 945.0 333.0 5.5184 274
3655 -118.43 34.22 36.0 1372.0 295.0 774.0 306.0 3.6618 187
9403 -122.53 37.88 25.0 4921.0 866.0 1913.0 834.0 6.8742 413
12112 -117.32 34.01 23.0 3021.0 527.0 1580.0 533.0 4.4063 129
20475 -118.75 34.26 24.0 2234.0 373.0 1325.0 383.0 5.4604 193
3872 -118.54 34.21 32.0 2593.0 566.0 1596.0 547.0 3.9886 199
20231 -119.27 34.26 23.0 3578.0 753.0 1455.0 649.0 4.1898 359
3670 -118.40 34.23 36.0 1643.0 349.0 1414.0 337.0 4.1181 172
7302 -118.19 33.99 40.0 1547.0 434.0 1930.0 427.0 3.3869 157
6505 -118.06 34.08 34.0 1197.0 260.0 942.0 245.0 3.4202 189
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 2/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [4]: data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 20640 non-null float64
1 latitude 20640 non-null float64
2 housing_median_age 20640 non-null float64
3 total_rooms 20640 non-null float64
4 total_bedrooms 20433 non-null float64
5 population 20640 non-null float64
6 households 20640 non-null float64
7 median_income 20640 non-null float64
8 median_house_value 20640 non-null float64
9 ocean_proximity 20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
In [5]: data.describe().round(2)
Out[5]: longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house
mean -119.57 35.63 28.64 2635.76 537.87 1425.48 499.54 3.87 206
std 2.00 2.14 12.59 2181.62 421.39 1132.46 382.33 1.90 115
25% -121.80 33.93 18.00 1447.75 296.00 787.00 280.00 2.56 119
50% -118.49 34.26 29.00 2127.00 435.00 1166.00 409.00 3.53 179
75% -118.01 37.71 37.00 3148.00 647.00 1725.00 605.00 4.74 264
max -114.31 41.95 52.00 39320.00 6445.00 35682.00 6082.00 15.00 500
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 3/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
Data cleaning
Out[6]: 0
Out[7]: longitude 0
latitude 0
housing_median_age 0
total_rooms 0
total_bedrooms 207
population 0
households 0
median_income 0
median_house_value 0
ocean_proximity 0
dtype: int64
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 4/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 5/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
Out[8]: total_bedrooms
280.0 55
331.0 51
345.0 50
343.0 49
393.0 49
328.0 48
348.0 48
394.0 48
272.0 47
309.0 47
314.0 46
322.0 46
399.0 46
295.0 46
317.0 46
313.0 45
290.0 45
346.0 45
340.0 45
287.0 45
388.0 45
284.0 45
291.0 45
294.0 44
269.0 44
390.0 44
312.0 44
460.0 44
300.0 44
361.0 44
365.0 44
398.0 43
335.0 43
416.0 43
254.0 43
289.0 43
369.0 43
373.0 43
428.0 43
292.0 42
339.0 42
315.0 42
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 6/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
458.0 42
360.0 42
308.0 42
282.0 42
358.0 41
325.0 41
410.0 41
347.0 41
Name: count, dtype: int64
In [9]: data.total_bedrooms.median()
Out[9]: 435.0
In [11]: data.isna().sum()
Out[11]: longitude 0
latitude 0
housing_median_age 0
total_rooms 0
total_bedrooms 0
population 0
households 0
median_income 0
median_house_value 0
ocean_proximity 0
dtype: int64
Preprocessing data
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 7/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [12]: data.ocean_proximity.value_counts()
Out[12]: ocean_proximity
<1H OCEAN 9136
INLAND 6551
NEAR OCEAN 2658
NEAR BAY 2290
ISLAND 5
Name: count, dtype: int64
In [13]: sns.histplot(data.ocean_proximity)
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 8/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [14]: data
Out[14]: longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 9/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [16]: data
Out[16]: longitude latitude housing_median_age total_rooms total_bedrooms population households median_income median_house_
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 10/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 11/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [18]: data[['total_rooms','median_house_value']].groupby(data['total_bedrooms']).value_counts().sort_values(ascendi
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 12/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [19]: data[['total_rooms','median_house_value']].value_counts().sort_values(ascending=False)[0:20]
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 13/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 14/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 15/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [23]: plt.figure(figsize=(10,7))
# Scatter plot of total_rooms (x axis) against housing_median_age (y axis).
# NOTE(review): `dt` is not defined in any cell visible here (the other cells use `data`) —
# confirm an earlier cell creates it, otherwise this cell fails on a fresh kernel.
plt.scatter(x = dt['total_rooms'],y = dt['housing_median_age'])
plt.xlabel('total_rooms')
plt.ylabel('housing_median_age')
plt.show()
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 16/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
Conclusion
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 17/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [26]: data[['longitude','latitude','median_house_value','ocean_proximity']].value_counts().sort_values(ascending =
In [27]: data[['median_house_value']].groupby(data['ocean_proximity']).value_counts().sort_values(ascending=False)[0:2
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 18/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [28]: data[['median_house_value','housing_median_age']].groupby(data['ocean_proximity']).value_counts().sort_values
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 19/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
Out[29]: median_house_value
500001.0 965
137500.0 122
162500.0 117
112500.0 103
187500.0 93
225000.0 92
350000.0 79
87500.0 78
275000.0 65
150000.0 64
175000.0 63
100000.0 62
125000.0 56
67500.0 55
250000.0 47
200000.0 46
118800.0 39
450000.0 37
156300.0 35
212500.0 33
Name: count, dtype: int64
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 20/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [30]: data['housing_median_age'].groupby(data['median_house_value']).value_counts().iloc[0:20]
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 21/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
population data
In [31]: data['population'].value_counts()
Out[31]: population
891.0 25
761.0 24
1227.0 24
1052.0 24
850.0 24
..
2141.0 1
5546.0 1
3186.0 1
3590.0 1
6912.0 1
Name: count, Length: 3888, dtype: int64
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 22/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [32]: plt.figure(figsize=(15,10))
plt.hist(data['population'],bins=150)
plt.xlabel('population')
plt.xlim(0,17000)
plt.ylim(0,6000)
plt.show()
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 23/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [33]: data[['population','households']].groupby(data['median_house_value']).value_counts().sort_values(ascending =
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 24/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [34]: plt.figure(figsize=(10,5))
sns.heatmap(data.corr(), annot=True,linewidths=2)
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 25/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 26/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
In [37]: data.columns
ML model to de
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 27/28
20/10/2024, 14:07 california-data-anaylsis - Jupyter Notebook
localhost:8888/notebooks/Downloads/california-data-anaylsis.ipynb 28/28