0% found this document useful (0 votes)

57 views26 pages

02 End To End Machine Learning Project

Uploaded by

angelalexisanayaalamea314

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

0% found this document useful (0 votes)

57 views26 pages

02 End To End Machine Learning Project

Uploaded by

angelalexisanayaalamea314

We take content rights seriously. If you suspect this is your content, claim it here.

Available Formats

Download as PDF, TXT or read online on Scribd

You are on page 1/ 26

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required

import sklearn
import re

# Compare the (major, minor) release numerically. Comparing the version
# strings directly is lexicographic, so e.g. "0.9" would wrongly satisfy
# `>= "0.20"` because "9" sorts after "2".
_sklearn_release = tuple(
    int(part) for part in re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups()
)
assert _sklearn_release >= (0, 20)

# Common imports
import numpy as np
import os

# To plot pretty figures

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures

PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension); also echoed in the log line.
    tight_layout : bool
        When true, call ``plt.tight_layout()`` before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Passed to ``savefig`` as ``dpi``.
    """
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

Get the Data

Download the Data
import os
import tarfile
import urllib.request

# Base URL of the raw files in the handson-ml2 GitHub repository. The literal
# must stay on one line: a line break inside the quotes is a syntax error.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
# Local directory the housing archive is downloaded to and extracted in.
HOUSING_PATH = os.path.join("datasets", "housing")
# Full URL of the compressed housing dataset.
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and extract it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``housing.tgz`` archive to download.
    housing_path : str
        Directory that receives ``housing.tgz`` and its extracted contents;
        created if it does not exist.
    """
    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: where the interpreter
            # supports extraction filters, refuse members that would escape
            # the destination directory.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

fetch_housing_data()

import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from ``housing_path`` into a pandas DataFrame."""
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

Take a Quick Look at the Data Structure

housing = load_housing_data()
housing.head()

longitude latitude housing_median_age total_rooms

total_bedrooms \
0 -122.23 37.88 41.0 880.0
129.0
1 -122.22 37.86 21.0 7099.0
1106.0
2 -122.24 37.85 52.0 1467.0
190.0
3 -122.25 37.85 52.0 1274.0
235.0
4 -122.25 37.85 52.0 1627.0
280.0

population households median_income median_house_value

ocean_proximity
0 322.0 126.0 8.3252 452600.0
NEAR BAY
1 2401.0 1138.0 8.3014 358500.0
NEAR BAY
2 496.0 177.0 7.2574 352100.0
NEAR BAY
3 558.0 219.0 5.6431 341300.0
NEAR BAY
4 565.0 259.0 3.8462 342200.0
NEAR BAY

housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 20640 non-null float64
1 latitude 20640 non-null float64
2 housing_median_age 20640 non-null float64
3 total_rooms 20640 non-null float64
4 total_bedrooms 20433 non-null float64
5 population 20640 non-null float64
6 households 20640 non-null float64
7 median_income 20640 non-null float64
8 median_house_value 20640 non-null float64
9 ocean_proximity 20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

housing["ocean_proximity"].value_counts()

ocean_proximity
<1H OCEAN 9136
INLAND 6551
NEAR OCEAN 2658
NEAR BAY 2290
ISLAND 5
Name: count, dtype: int64

housing.describe()

longitude latitude housing_median_age total_rooms \

count 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081
std 2.003532 2.135952 12.585558 2181.615252
min -124.350000 32.540000 1.000000 2.000000
25% -121.800000 33.930000 18.000000 1447.750000
50% -118.490000 34.260000 29.000000 2127.000000
75% -118.010000 37.710000 37.000000 3148.000000
max -114.310000 41.950000 52.000000 39320.000000

total_bedrooms population households median_income \

count 20433.000000 20640.000000 20640.000000 20640.000000
mean 537.870553 1425.476744 499.539680 3.870671
std 421.385070 1132.462122 382.329753 1.899822
min 1.000000 3.000000 1.000000 0.499900
25% 296.000000 787.000000 280.000000 2.563400
50% 435.000000 1166.000000 409.000000 3.534800
75% 647.000000 1725.000000 605.000000 4.743250
max 6445.000000 35682.000000 6082.000000 15.000100
median_house_value
count 20640.000000
mean 206855.816909
std 115395.615874
min 14999.000000
25% 119600.000000
50% 179700.000000
75% 264725.000000
max 500001.000000

%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()

Saving figure attribute_histogram_plots

Create a Test Set
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2,

random_state=42)

test_set.head()

longitude latitude housing_median_age total_rooms

total_bedrooms \
20046 -119.01 36.06 25.0 1505.0
NaN
3024 -119.46 35.14 30.0 2943.0
NaN
15663 -122.44 37.80 52.0 3830.0
NaN
20484 -118.72 34.28 17.0 3051.0
NaN
9814 -121.93 36.62 34.0 2351.0
NaN

population households median_income median_house_value \

20046 1392.0 359.0 1.6812 47700.0
3024 1565.0 584.0 2.5313 45800.0
15663 1310.0 963.0 3.4801 500001.0
20484 1705.0 495.0 5.7376 218600.0
9814 1063.0 428.0 3.7250 278000.0

ocean_proximity
20046 INLAND
3024 INLAND
15663 NEAR BAY
20484 <1H OCEAN
9814 NEAR OCEAN

housing["median_income"].hist()

<Axes: >
housing["income_cat"] = pd.cut(housing["median_income"],
bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
labels=[1, 2, 3, 4, 5])

housing["income_cat"].value_counts()

income_cat
3 7236
2 6581
4 3639
5 2362
1 822
Name: count, dtype: int64

housing["income_cat"].hist()

<Axes: >
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2,

random_state=42)
# n_splits=1, so this loop body runs exactly once.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # split() yields positional indices, so select rows by position; this is
    # identical to label lookup here only because the frame has a RangeIndex.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

strat_test_set["income_cat"].value_counts() / len(strat_test_set)

income_cat
3 0.350533
2 0.318798
4 0.176357
5 0.114341
1 0.039971
Name: count, dtype: float64

housing["income_cat"].value_counts() / len(housing)

income_cat
3 0.350581
2 0.318847
4 0.176308
5 0.114438
1 0.039826
Name: count, dtype: float64

def income_cat_proportions(data):
    """Return the fraction of rows of ``data`` in each ``income_cat`` value.

    The result is a Series indexed by category; each entry is that
    category's row count divided by ``len(data)``.
    """
    return data["income_cat"].value_counts() / len(data)

train_set, test_set = train_test_split(housing, test_size=0.2,

random_state=42)

compare_props = pd.DataFrame({
"Overall": income_cat_proportions(housing),
"Stratified": income_cat_proportions(strat_test_set),
"Random": income_cat_proportions(test_set),
}).sort_index()
# Relative error (in percent) of each sampling method versus the overall
# proportions. Parenthesised so the expressions may span several lines.
compare_props["Rand. %error"] = (
    100 * compare_props["Random"] / compare_props["Overall"] - 100
)
compare_props["Strat. %error"] = (
    100 * compare_props["Stratified"] / compare_props["Overall"] - 100
)

compare_props

Overall Stratified Random Rand. %error Strat.

%error
income_cat

1 0.039826 0.039971 0.040213 0.973236

0.364964
2 0.318847 0.318798 0.324370 1.732260 -
0.015195
3 0.350581 0.350533 0.358527 2.266446 -
0.013820
4 0.176308 0.176357 0.167393 -5.056334
0.027480
5 0.114438 0.114341 0.109496 -4.318374 -
0.084674

# income_cat was only needed for stratification; remove it from both sets.
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

Discover and Visualize the Data to Gain Insights

housing = strat_train_set.copy()

Visualizing Geographical Data

housing.plot(kind="scatter", x="longitude", y="latitude")
save_fig("bad_visualization_plot")

Saving figure bad_visualization_plot

housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
save_fig("better_visualization_plot")

Saving figure better_visualization_plot

housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
s=housing["population"]/100, label="population",
figsize=(10,7),
c="median_house_value", cmap=plt.get_cmap("jet"),
colorbar=True,
sharex=False)
plt.legend()
save_fig("housing_prices_scatterplot")

Saving figure housing_prices_scatterplot

# Download the California image
images_path = os.path.join(PROJECT_ROOT_DIR, "images", "end_to_end_project")
os.makedirs(images_path, exist_ok=True)
# The URL literal must stay on a single line; a break inside the quotes is a
# syntax error.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
filename = "california.png"
print("Downloading", filename)
url = DOWNLOAD_ROOT + "images/end_to_end_project/" + filename
urllib.request.urlretrieve(url, os.path.join(images_path, filename))

Downloading california.png

('.\\images\\end_to_end_project\\california.png',
<http.client.HTTPMessage at 0x25e58fecd10>)

import matplotlib.image as mpimg

california_img=mpimg.imread(os.path.join(images_path, filename))

ax = housing.plot(kind="scatter", x="longitude", y="latitude",

figsize=(10,7),
s=housing['population']/100, label="Population",
c="median_house_value", cmap=plt.get_cmap("jet"),
colorbar=False, alpha=0.4)

plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05],

alpha=0.5,
cmap=plt.get_cmap("jet"))

plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)

prices = housing["median_house_value"]
tick_values = np.linspace(prices.min(), prices.max(), 11)
cbar = plt.colorbar(ticks=tick_values/prices.max())
cbar.ax.set_yticklabels(["$%dk"%(round(v/1000)) for v in tick_values],
fontsize=14)
cbar.set_label('Median House Value', fontsize=16)

plt.legend(fontsize=16)
save_fig("california_housing_prices_plot")
plt.show()

Saving figure california_housing_prices_plot

Looking for Correlations
housing_original = housing.copy()

housing = housing_original[['longitude', 'latitude',

'housing_median_age', 'total_rooms',
'total_bedrooms', 'population', 'households', 'median_income',
'median_house_value']]
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value 1.000000
median_income 0.687151
total_rooms 0.135140
housing_median_age 0.114146
households 0.064590
total_bedrooms 0.047781
population -0.026882
longitude -0.047466
latitude -0.142673
Name: median_house_value, dtype: float64

# from pandas.tools.plotting import scatter_matrix  # For older versions of Pandas
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",

"housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
save_fig("scatter_matrix_plot")

Saving figure scatter_matrix_plot

housing.plot(kind="scatter", x="median_income",
y="median_house_value",
alpha=0.1)
plt.axis([0, 16, 0, 550000])
save_fig("income_vs_house_value_scatterplot")

Saving figure income_vs_house_value_scatterplot

Experimenting with Attribute Combinations
# Derived ratio features; each is the element-wise quotient of two columns.
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = (
    housing["population"] / housing["households"]
)

corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value 1.000000
median_income 0.687151
rooms_per_household 0.146255
total_rooms 0.135140
housing_median_age 0.114146
households 0.064590
total_bedrooms 0.047781
population_per_household -0.021991
population -0.026882
longitude -0.047466
latitude -0.142673
bedrooms_per_room -0.259952
Name: median_house_value, dtype: float64

housing.plot(kind="scatter", x="rooms_per_household",
y="median_house_value",
alpha=0.2)
plt.axis([0, 5, 0, 520000])
plt.show()

housing.describe()

longitude latitude housing_median_age total_rooms \

count 16512.000000 16512.000000 16512.000000 16512.000000
mean -119.575635 35.639314 28.653404 2622.539789
std 2.001828 2.137963 12.574819 2138.417080
min -124.350000 32.540000 1.000000 6.000000
25% -121.800000 33.940000 18.000000 1443.000000
50% -118.510000 34.260000 29.000000 2119.000000
75% -118.010000 37.720000 37.000000 3141.000000
max -114.310000 41.950000 52.000000 39320.000000

total_bedrooms population households median_income \

count 16354.000000 16512.000000 16512.000000 16512.000000
mean 534.914639 1419.687379 497.011810 3.875884
std 412.665649 1115.663036 375.696156 1.904931
min 2.000000 3.000000 2.000000 0.499900
25% 295.000000 784.000000 279.000000 2.566950
50% 433.000000 1164.000000 408.000000 3.541550
75% 644.000000 1719.000000 602.000000 4.745325
max 6210.000000 35682.000000 5358.000000 15.000100

median_house_value rooms_per_household bedrooms_per_room \

count 16512.000000 16512.000000 16354.000000
mean 207005.322372 5.440406 0.212873
std 115701.297250 2.611696 0.057378
min 14999.000000 1.130435 0.100000
25% 119800.000000 4.442168 0.175304
50% 179500.000000 5.232342 0.203027
75% 263900.000000 6.056361 0.239816
max 500001.000000 141.909091 1.000000

population_per_household
count 16512.000000
mean 3.096469
std 11.584825
min 0.692308
25% 2.431352
50% 2.817661
75% 3.281420
max 1243.333333

Prepare the Data for Machine Learning Algorithms

# Drop labels for training set: features go in `housing`, the target column
# is kept separately (copied) in `housing_labels`.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

Data Cleaning
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incomplete_rows

longitude latitude housing_median_age total_rooms

total_bedrooms \
1606 -122.08 37.88 26.0 2947.0
NaN
10915 -117.87 33.73 45.0 2264.0
NaN
19150 -122.70 38.35 14.0 2313.0
NaN
4186 -118.23 34.13 48.0 1308.0
NaN
16885 -122.40 37.58 26.0 3281.0
NaN

population households median_income ocean_proximity

1606 825.0 626.0 2.9330 NEAR BAY
10915 1970.0 499.0 3.4193 <1H OCEAN
19150 954.0 397.0 3.7813 <1H OCEAN
4186 835.0 294.0 4.2891 <1H OCEAN
16885 1145.0 480.0 6.3580 NEAR OCEAN

median = housing["total_bedrooms"].median()
# option 3: fill the missing values with the median.
# Calling fillna(..., inplace=True) on `df[col]` is chained assignment: pandas
# emits a FutureWarning and states it will never work from pandas 3.0 on.
# Fill through the frame and rebind the result instead.
sample_incomplete_rows = sample_incomplete_rows.fillna({"total_bedrooms": median})

C:\Users\Angel Anaya\AppData\Local\Temp\
ipykernel_27768\760120979.py:2: FutureWarning: A value is trying to be
set on a copy of a DataFrame or Series through chained assignment
using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never
work because the intermediate object on which we are setting values
always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try

using 'df.method({col: value}, inplace=True)' or df[col] =
df[col].method(value) instead, to perform the operation inplace on the
original object.

sample_incomplete_rows["total_bedrooms"].fillna(median,
inplace=True) # option 3

sample_incomplete_rows

longitude latitude housing_median_age total_rooms

total_bedrooms \
1606 -122.08 37.88 26.0 2947.0
433.0
10915 -117.87 33.73 45.0 2264.0
433.0
19150 -122.70 38.35 14.0 2313.0
433.0
4186 -118.23 34.13 48.0 1308.0
433.0
16885 -122.40 37.58 26.0 3281.0
433.0

population households median_income ocean_proximity

1606 825.0 626.0 2.9330 NEAR BAY
10915 1970.0 499.0 3.4193 <1H OCEAN
19150 954.0 397.0 3.7813 <1H OCEAN
4186 835.0 294.0 4.2891 <1H OCEAN
16885 1145.0 480.0 6.3580 NEAR OCEAN

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

housing_num = housing.drop("ocean_proximity", axis=1)

imputer.fit(housing_num)

SimpleImputer(strategy='median')

imputer.statistics_

array([-118.51 , 34.26 , 29. , 2119. , 433. ,

1164. , 408. , 3.54155])

housing_num.median().values

array([-118.51 , 34.26 , 29. , 2119. , 433. ,

1164. , 408. , 3.54155])

X = imputer.transform(housing_num)

housing_tr = pd.DataFrame(X, columns=housing_num.columns,

index=housing.index)

housing_tr.loc[sample_incomplete_rows.index.values]

longitude latitude housing_median_age total_rooms

population households median_income

1606 825.0 626.0 2.9330
10915 1970.0 499.0 3.4193
19150 954.0 397.0 3.7813
4186 835.0 294.0 4.2891
16885 1145.0 480.0 6.3580

imputer.strategy

'median'
housing_tr = pd.DataFrame(X, columns=housing_num.columns,
index=housing_num.index)

housing_tr.head()

longitude latitude housing_median_age total_rooms

total_bedrooms \
12655 -121.46 38.52 29.0 3873.0
797.0
15502 -117.23 33.09 7.0 5320.0
855.0
2908 -119.04 35.37 44.0 1618.0
310.0
14053 -117.13 32.75 24.0 1877.0
519.0
20496 -118.70 34.28 27.0 3536.0
646.0

population households median_income

12655 2237.0 706.0 2.1736
15502 2015.0 768.0 6.3373
2908 667.0 300.0 2.8750
14053 898.0 483.0 2.2264
20496 1837.0 580.0 4.4964

Handling Text and Categorical Attributes

housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

ocean_proximity
12655 INLAND
15502 NEAR OCEAN
2908 INLAND
14053 NEAR OCEAN
20496 <1H OCEAN
1481 NEAR BAY
18125 <1H OCEAN
5830 <1H OCEAN
17989 <1H OCEAN
4861 <1H OCEAN

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

array([[1.],
[4.],
[1.],
[4.],
[0.],
[3.],
[0.],
[0.],
[0.],
[0.]])

ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],

dtype=object)]

from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

<16512x5 sparse matrix of type '<class 'numpy.float64'>'

with 16512 stored elements in Compressed Sparse Row format>

housing_cat_1hot.toarray()

array([[0., 1., 0., 0., 0.],

[0., 0., 0., 0., 1.],
[0., 1., 0., 0., 0.],
...,
[1., 0., 0., 0., 0.],
[1., 0., 0., 0., 0.],
[0., 1., 0., 0., 0.]])

cat_encoder = OneHotEncoder(sparse_output=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

array([[0., 1., 0., 0., 0.],

[0., 0., 0., 0., 1.],
[0., 1., 0., 0., 0.],
...,
[1., 0., 0., 0., 0.],
[1., 0., 0., 0., 0.],
[0., 1., 0., 0., 0.]])

Custom Transformers
from sklearn.base import BaseEstimator, TransformerMixin

# column index
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features to a 2-D feature array.

    ``transform`` adds rooms-per-household and population-per-household
    columns, plus bedrooms-per-room when ``add_bedrooms_per_room`` is true.
    Source columns are located through the module-level indices
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is nothing to learn."""
        return self  # nothing else to do

    def transform(self, X):
        """Return ``X`` with the extra ratio columns appended on the right.

        ``X`` is indexed as ``X[:, i]``, so it must support 2-D positional
        indexing (e.g. a NumPy array).
        """
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# Look the column positions up by name. The tuple is parenthesised so all
# four names belong to it; the four-way unpacking below depends on that.
col_names = ("total_rooms", "total_bedrooms", "population", "households")
rooms_ix, bedrooms_ix, population_ix, households_ix = [
    housing.columns.get_loc(c) for c in col_names
]  # get the column indices

housing_extra_attribs = pd.DataFrame(
housing_extra_attribs,
columns=list(housing.columns)+["rooms_per_household",
"population_per_household"],
index=housing.index)
housing_extra_attribs.head()

longitude latitude housing_median_age total_rooms total_bedrooms

\
12655 -121.46 38.52 29.0 3873.0 797.0

15502 -117.23 33.09 7.0 5320.0 855.0

2908 -119.04 35.37 44.0 1618.0 310.0

14053 -117.13 32.75 24.0 1877.0 519.0

20496 -118.7 34.28 27.0 3536.0 646.0

population households median_income ocean_proximity

rooms_per_household \
12655 2237.0 706.0 2.1736 INLAND
5.485836
15502 2015.0 768.0 6.3373 NEAR OCEAN
6.927083
2908 667.0 300.0 2.875 INLAND
5.393333
14053 898.0 483.0 2.2264 NEAR OCEAN
3.886128
20496 1837.0 580.0 4.4964 <1H OCEAN
6.096552

population_per_household
12655 3.168555
15502 2.623698
2908 2.223333
14053 1.859213
20496 3.167241

Transformation Pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

housing_num_tr

array([[-0.94135046, 1.34743822, 0.02756357, ..., 0.01739526,

0.00622264, -0.12112176],
[ 1.17178212, -1.19243966, -1.72201763, ..., 0.56925554,
-0.04081077, -0.81086696],
[ 0.26758118, -0.1259716 , 1.22045984, ..., -0.01802432,
-0.07537122, -0.33827252],
...,
[-1.5707942 , 1.31001828, 1.53856552, ..., -0.5092404 ,
-0.03743619, 0.32286937],
[-1.56080303, 1.2492109 , -1.1653327 , ..., 0.32814891,
-0.05915604, -0.45702273],
[-1.28105026, 2.02567448, -0.13148926, ..., 0.01407228,
0.00657083, -0.12169672]])

from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
("num", num_pipeline, num_attribs),
("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

housing_prepared

array([[-0.94135046, 1.34743822, 0.02756357, ..., 0. ,

0. , 0. ],
[ 1.17178212, -1.19243966, -1.72201763, ..., 0. ,
0. , 1. ],
[ 0.26758118, -0.1259716 , 1.22045984, ..., 0. ,
0. , 0. ],
...,
[-1.5707942 , 1.31001828, 1.53856552, ..., 0. ,
0. , 0. ],
[-1.56080303, 1.2492109 , -1.1653327 , ..., 0. ,
0. , 0. ],
[-1.28105026, 2.02567448, -0.13148926, ..., 0. ,
0. , 0. ]])

housing_prepared.shape

(16512, 16)

Select and Train a Model

Training and Evaluating on the Training Set
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression()

# let's try the full preprocessing pipeline on a few training instances
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

Predictions: [ 85657.90192014 305492.60737488 152056.46122456

186095.70946094
244550.67966089]
print("Labels:", list(some_labels))

Labels: [72100.0, 279600.0, 82700.0, 112500.0, 238300.0]

some_data_prepared

array([[-0.94135046, 1.34743822, 0.02756357, 0.58477745,

0.64037127,
0.73260236, 0.55628602, -0.8936472 , 0.01739526,
0.00622264,
-0.12112176, 0. , 1. , 0. , 0.
,
0. ],
[ 1.17178212, -1.19243966, -1.72201763, 1.26146668,
0.78156132,
0.53361152, 0.72131799, 1.292168 , 0.56925554, -
0.04081077,
-0.81086696, 0. , 0. , 0. , 0.
,
1. ],
[ 0.26758118, -0.1259716 , 1.22045984, -0.46977281, -
0.54513828,
-0.67467519, -0.52440722, -0.52543365, -0.01802432, -
0.07537122,
-0.33827252, 0. , 1. , 0. , 0.
,
0. ],
[ 1.22173797, -1.35147437, -0.37006852, -0.34865152, -
0.03636724,
-0.46761716, -0.03729672, -0.86592882, -0.59513997, -
0.10680295,
0.96120521, 0. , 0. , 0. , 0.
,
1. ],
[ 0.43743108, -0.63581817, -0.13148926, 0.42717947,
0.27279028,
0.37406031, 0.22089846, 0.32575178, 0.2512412 ,
0.00610923,
-0.47451338, 1. , 0. , 0. , 0.
,
0. ]])

from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print("root_mean_squared_error:", lin_rmse)

root_mean_squared_error: 68627.87390018745
from sklearn.metrics import mean_squared_error

lin_mse = mean_squared_error(housing_labels, housing_predictions)

print("mean_squared_error:", lin_mse)

mean_squared_error: 4709785076.060029

from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(housing_labels, housing_predictions)

print("mean_absolute_error:", lin_mae)

mean_absolute_error: 49438.66860915801

Unit 1: Shobana T S Assistant Professor Dept. of ISE, BMSCE
No ratings yet
Unit 1: Shobana T S Assistant Professor Dept. of ISE, BMSCE
127 pages
Housing Prices Notebook
No ratings yet
Housing Prices Notebook
14 pages
House Price Prediction: # Importing Necessary Libraries
No ratings yet
House Price Prediction: # Importing Necessary Libraries
18 pages
Eda Project
No ratings yet
Eda Project
28 pages
ML Lab - BCSL606
No ratings yet
ML Lab - BCSL606
67 pages
Capstone Project Report
No ratings yet
Capstone Project Report
187 pages
MQL 4
100% (1)
MQL 4
1,665 pages
Example Project California Data Anaylsis Jupyter Notebook
No ratings yet
Example Project California Data Anaylsis Jupyter Notebook
28 pages
The Data Science Process
100% (1)
The Data Science Process
53 pages
Exit Exam From Ministry of Education
No ratings yet
Exit Exam From Ministry of Education
90 pages
Ex 2 TP1
No ratings yet
Ex 2 TP1
16 pages
HW 3
No ratings yet
HW 3
20 pages
House Price Prediction
No ratings yet
House Price Prediction
14 pages
PHP Notes BCA NEP Unit-2
No ratings yet
PHP Notes BCA NEP Unit-2
35 pages
Compte Rendu Data Visualisation
No ratings yet
Compte Rendu Data Visualisation
5 pages
Exp - 2-EDA - CaliforniaData Set - HeatMap - PairPlot-checkpoint - Jupyter Notebook
No ratings yet
Exp - 2-EDA - CaliforniaData Set - HeatMap - PairPlot-checkpoint - Jupyter Notebook
12 pages
IE0005 Exercise Solutions 2-6
No ratings yet
IE0005 Exercise Solutions 2-6
84 pages
Ex 1
No ratings yet
Ex 1
119 pages
Evan Marie Carr - Python and SKlearn
No ratings yet
Evan Marie Carr - Python and SKlearn
32 pages
Linear Regression Using Python
No ratings yet
Linear Regression Using Python
18 pages
Regression Algorithm
No ratings yet
Regression Algorithm
9 pages
Linear Regression Analysis - Polynomial Regression
No ratings yet
Linear Regression Analysis - Polynomial Regression
25 pages
Linear Regression With Python - Part 1
No ratings yet
Linear Regression With Python - Part 1
167 pages
Machine Learning Laboratory
No ratings yet
Machine Learning Laboratory
23 pages
Real Estate Price Prediction Model
No ratings yet
Real Estate Price Prediction Model
33 pages
DL 1
No ratings yet
DL 1
11 pages
MiniProject BI
No ratings yet
MiniProject BI
16 pages
Assignment 1
No ratings yet
Assignment 1
3 pages
Data Analysis With Python - Jupyter Notebook
No ratings yet
Data Analysis With Python - Jupyter Notebook
10 pages
Regression Analysis - Lasso and Ridge Regularization
No ratings yet
Regression Analysis - Lasso and Ridge Regularization
17 pages
C++ Programming - Yuan Dong, Fang Yang Li Zheng
No ratings yet
C++ Programming - Yuan Dong, Fang Yang Li Zheng
504 pages
Computer Science Syllabus RU
No ratings yet
Computer Science Syllabus RU
55 pages
ML Observation
No ratings yet
ML Observation
29 pages
FALLSEM2021-22 MDI4001 ETH VL2021220104135 Reference Material I 09-Aug-2021 Data2 1
No ratings yet
FALLSEM2021-22 MDI4001 ETH VL2021220104135 Reference Material I 09-Aug-2021 Data2 1
9 pages
BDA Section 3
No ratings yet
BDA Section 3
33 pages
Ads Exp5 Code
No ratings yet
Ads Exp5 Code
2 pages
TwinCAT 3 ADS INTRO EN
No ratings yet
TwinCAT 3 ADS INTRO EN
166 pages
Madhav PRGM File
No ratings yet
Madhav PRGM File
104 pages
Faseeh Chap 2 Report
No ratings yet
Faseeh Chap 2 Report
30 pages
Week 1 Get Familier With Jupyter Notebook
No ratings yet
Week 1 Get Familier With Jupyter Notebook
4 pages
ML Merged
No ratings yet
ML Merged
28 pages
DALab Part-B BCU&BU
No ratings yet
DALab Part-B BCU&BU
12 pages
Housing Main
No ratings yet
Housing Main
23 pages
Computer Science 1
No ratings yet
Computer Science 1
4 pages
Import As Import As From Import: "Mean Squared Errors: "
No ratings yet
Import As Import As From Import: "Mean Squared Errors: "
1 page
Emllab
No ratings yet
Emllab
6 pages
Project PDF
No ratings yet
Project PDF
13 pages
Tarea - Prediccion de Casas en California
No ratings yet
Tarea - Prediccion de Casas en California
5 pages
Normialization Dataset
No ratings yet
Normialization Dataset
7 pages
Financial Modeling 4th Edition PDF
No ratings yet
Financial Modeling 4th Edition PDF
35 pages
P04 The Regression Pipeline - Preprocessing Ans
No ratings yet
P04 The Regression Pipeline - Preprocessing Ans
19 pages
Exercise3 Solution
No ratings yet
Exercise3 Solution
19 pages
Boston Housing Analysis
No ratings yet
Boston Housing Analysis
3 pages
California Housing Project
No ratings yet
California Housing Project
5 pages
Setup: Chapter 2 - End-To-End Machine Learning Project
No ratings yet
Setup: Chapter 2 - End-To-End Machine Learning Project
31 pages
Lecture 08 Adapters
No ratings yet
Lecture 08 Adapters
48 pages
House Price Prediction Models
No ratings yet
House Price Prediction Models
16 pages
Exercise2 Solution
No ratings yet
Exercise2 Solution
15 pages
C and CPP by Nachiketa
No ratings yet
C and CPP by Nachiketa
69 pages
Python Assignment 1.ipynb - Colaboratory
No ratings yet
Python Assignment 1.ipynb - Colaboratory
3 pages
2019 Winter Model Answer Paper (Msbte Study Resources)
No ratings yet
2019 Winter Model Answer Paper (Msbte Study Resources)
24 pages
Bca Sem 2 Syllabus
No ratings yet
Bca Sem 2 Syllabus
9 pages
Quantam - Learning - Colaboratory
No ratings yet
Quantam - Learning - Colaboratory
13 pages
Predicting Home Prices in Bangalore
No ratings yet
Predicting Home Prices in Bangalore
18 pages
Assignement 4
No ratings yet
Assignement 4
6 pages
Project 4 - House Price Prediction - Ipynb - Colab
No ratings yet
Project 4 - House Price Prediction - Ipynb - Colab
5 pages
Boston Housing Solutions
No ratings yet
Boston Housing Solutions
3 pages
California Housing Price Prediction .
No ratings yet
California Housing Price Prediction .
1 page
Programming in C QB
No ratings yet
Programming in C QB
20 pages
Exam Practise Booklet - Unit 2
No ratings yet
Exam Practise Booklet - Unit 2
45 pages
Week 12
No ratings yet
Week 12
2 pages
Wt&ds Sem-1 Sppu Slips
No ratings yet
Wt&ds Sem-1 Sppu Slips
30 pages
Data Structures: Hapter
No ratings yet
Data Structures: Hapter
36 pages
Animesh Mani (OOP)
No ratings yet
Animesh Mani (OOP)
24 pages
Kaggle Machine Learning
No ratings yet
Kaggle Machine Learning
6 pages
MiniGo Spec
No ratings yet
MiniGo Spec
24 pages
Julia Arrays
No ratings yet
Julia Arrays
20 pages
Cambridge International AS & A Level: Computer Science 9618/21
No ratings yet
Cambridge International AS & A Level: Computer Science 9618/21
20 pages
Prac - 8 (1) - Jupyter Notebook
No ratings yet
Prac - 8 (1) - Jupyter Notebook
6 pages
Introduction To Machine Learning (ML) With Sklearn
No ratings yet
Introduction To Machine Learning (ML) With Sklearn
10 pages
Accelerating Fortran Codes: A Method For Integrating Coarray Fortran With Cuda Fortran and Openmp
No ratings yet
Accelerating Fortran Codes: A Method For Integrating Coarray Fortran With Cuda Fortran and Openmp
16 pages
12 CS EM Public Answer Key May 2022
No ratings yet
12 CS EM Public Answer Key May 2022
10 pages
5th and 6th Topic
No ratings yet
5th and 6th Topic
8 pages
House Price Prediction
No ratings yet
House Price Prediction
1 page
Lujain Alg
No ratings yet
Lujain Alg
6 pages
JAVA Assignment 12
No ratings yet
JAVA Assignment 12
5 pages
Enhanced Embedded Systems Midterm Exam
No ratings yet
Enhanced Embedded Systems Midterm Exam
3 pages
Csb-Pps Assignment 1 2024-25
No ratings yet
Csb-Pps Assignment 1 2024-25
2 pages
OOPS C++ LAB List of Experiments
No ratings yet
OOPS C++ LAB List of Experiments
2 pages