02 End To End Machine Learning Project

# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "end_to_end_project"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

Get the Data


Download the Data
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_PATH = os.path.join("datasets", "housing")
HOUSING_URL = DOWNLOAD_ROOT + "datasets/housing/housing.tgz"

def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    if not os.path.isdir(housing_path):
        os.makedirs(housing_path)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    housing_tgz = tarfile.open(tgz_path)
    housing_tgz.extractall(path=housing_path)
    housing_tgz.close()

fetch_housing_data()

import pandas as pd

def load_housing_data(housing_path=HOUSING_PATH):
    csv_path = os.path.join(housing_path, "housing.csv")
    return pd.read_csv(csv_path)

Take a Quick Look at the Data Structure


housing = load_housing_data()
housing.head()

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0
1    -122.22     37.86                21.0       7099.0          1106.0
2    -122.24     37.85                52.0       1467.0           190.0
3    -122.25     37.85                52.0       1274.0           235.0
4    -122.25     37.85                52.0       1627.0           280.0

   population  households  median_income  median_house_value ocean_proximity
0       322.0       126.0         8.3252            452600.0        NEAR BAY
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY
2       496.0       177.0         7.2574            352100.0        NEAR BAY
3       558.0       219.0         5.6431            341300.0        NEAR BAY
4       565.0       259.0         3.8462            342200.0        NEAR BAY

housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 20640 non-null float64
1 latitude 20640 non-null float64
2 housing_median_age 20640 non-null float64
3 total_rooms 20640 non-null float64
4 total_bedrooms 20433 non-null float64
5 population 20640 non-null float64
6 households 20640 non-null float64
7 median_income 20640 non-null float64
8 median_house_value 20640 non-null float64
9 ocean_proximity 20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

housing["ocean_proximity"].value_counts()

ocean_proximity
<1H OCEAN 9136
INLAND 6551
NEAR OCEAN 2658
NEAR BAY 2290
ISLAND 5
Name: count, dtype: int64

housing.describe()

          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000
mean    -119.569704     35.631861           28.639486   2635.763081
std        2.003532      2.135952           12.585558   2181.615252
min     -124.350000     32.540000            1.000000      2.000000
25%     -121.800000     33.930000           18.000000   1447.750000
50%     -118.490000     34.260000           29.000000   2127.000000
75%     -118.010000     37.710000           37.000000   3148.000000
max     -114.310000     41.950000           52.000000  39320.000000

       total_bedrooms    population    households  median_income  \
count    20433.000000  20640.000000  20640.000000   20640.000000
mean       537.870553   1425.476744    499.539680       3.870671
std        421.385070   1132.462122    382.329753       1.899822
min          1.000000      3.000000      1.000000       0.499900
25%        296.000000    787.000000    280.000000       2.563400
50%        435.000000   1166.000000    409.000000       3.534800
75%        647.000000   1725.000000    605.000000       4.743250
max       6445.000000  35682.000000   6082.000000      15.000100

       median_house_value
count        20640.000000
mean        206855.816909
std         115395.615874
min          14999.000000
25%         119600.000000
50%         179700.000000
75%         264725.000000
max         500001.000000

%matplotlib inline
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(20,15))
save_fig("attribute_histogram_plots")
plt.show()

Saving figure attribute_histogram_plots


Create a Test Set
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2,
                                       random_state=42)

test_set.head()

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
20046    -119.01     36.06                25.0       1505.0             NaN
3024     -119.46     35.14                30.0       2943.0             NaN
15663    -122.44     37.80                52.0       3830.0             NaN
20484    -118.72     34.28                17.0       3051.0             NaN
9814     -121.93     36.62                34.0       2351.0             NaN

       population  households  median_income  median_house_value  \
20046      1392.0       359.0         1.6812             47700.0
3024       1565.0       584.0         2.5313             45800.0
15663      1310.0       963.0         3.4801            500001.0
20484      1705.0       495.0         5.7376            218600.0
9814       1063.0       428.0         3.7250            278000.0

      ocean_proximity
20046          INLAND
3024           INLAND
15663        NEAR BAY
20484       <1H OCEAN
9814       NEAR OCEAN

housing["median_income"].hist()

<Axes: >

housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

housing["income_cat"].value_counts()

income_cat
3 7236
2 6581
4 3639
5 2362
1 822
Name: count, dtype: int64

housing["income_cat"].hist()

<Axes: >
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_index]
    strat_test_set = housing.loc[test_index]

strat_test_set["income_cat"].value_counts() / len(strat_test_set)

income_cat
3 0.350533
2 0.318798
4 0.176357
5 0.114341
1 0.039971
Name: count, dtype: float64

housing["income_cat"].value_counts() / len(housing)

income_cat
3 0.350581
2 0.318847
4 0.176308
5 0.114438
1 0.039826
Name: count, dtype: float64

def income_cat_proportions(data):
    return data["income_cat"].value_counts() / len(data)

train_set, test_set = train_test_split(housing, test_size=0.2,
                                       random_state=42)

compare_props = pd.DataFrame({
    "Overall": income_cat_proportions(housing),
    "Stratified": income_cat_proportions(strat_test_set),
    "Random": income_cat_proportions(test_set),
}).sort_index()
compare_props["Rand. %error"] = 100 * compare_props["Random"] / compare_props["Overall"] - 100
compare_props["Strat. %error"] = 100 * compare_props["Stratified"] / compare_props["Overall"] - 100

compare_props

             Overall  Stratified    Random  Rand. %error  Strat. %error
income_cat
1           0.039826    0.039971  0.040213      0.973236       0.364964
2           0.318847    0.318798  0.324370      1.732260      -0.015195
3           0.350581    0.350533  0.358527      2.266446      -0.013820
4           0.176308    0.176357  0.167393     -5.056334       0.027480
5           0.114438    0.114341  0.109496     -4.318374      -0.084674

for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)

Discover and Visualize the Data to Gain Insights


housing = strat_train_set.copy()

Visualizing Geographical Data


housing.plot(kind="scatter", x="longitude", y="latitude")
save_fig("bad_visualization_plot")

Saving figure bad_visualization_plot


housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.1)
save_fig("better_visualization_plot")

Saving figure better_visualization_plot


housing.plot(kind="scatter", x="longitude", y="latitude", alpha=0.4,
             s=housing["population"]/100, label="population", figsize=(10,7),
             c="median_house_value", cmap=plt.get_cmap("jet"), colorbar=True,
             sharex=False)
plt.legend()
save_fig("housing_prices_scatterplot")

Saving figure housing_prices_scatterplot


# Download the California image
images_path = os.path.join(PROJECT_ROOT_DIR, "images", "end_to_end_project")
os.makedirs(images_path, exist_ok=True)
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
filename = "california.png"
print("Downloading", filename)
url = DOWNLOAD_ROOT + "images/end_to_end_project/" + filename
urllib.request.urlretrieve(url, os.path.join(images_path, filename))

Downloading california.png

('.\\images\\end_to_end_project\\california.png',
<http.client.HTTPMessage at 0x25e58fecd10>)

import matplotlib.image as mpimg

california_img = mpimg.imread(os.path.join(images_path, filename))

ax = housing.plot(kind="scatter", x="longitude", y="latitude", figsize=(10,7),
                  s=housing['population']/100, label="Population",
                  c="median_house_value", cmap=plt.get_cmap("jet"),
                  colorbar=False, alpha=0.4)

plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,
           cmap=plt.get_cmap("jet"))

plt.ylabel("Latitude", fontsize=14)
plt.xlabel("Longitude", fontsize=14)

prices = housing["median_house_value"]
tick_values = np.linspace(prices.min(), prices.max(), 11)
cbar = plt.colorbar(ticks=tick_values/prices.max())
cbar.ax.set_yticklabels(["$%dk" % (round(v/1000)) for v in tick_values],
                        fontsize=14)
cbar.set_label('Median House Value', fontsize=16)

plt.legend(fontsize=16)
save_fig("california_housing_prices_plot")
plt.show()

Saving figure california_housing_prices_plot


Looking for Correlations


housing_original = housing.copy()

housing = housing_original[['longitude', 'latitude', 'housing_median_age',
                            'total_rooms', 'total_bedrooms', 'population',
                            'households', 'median_income',
                            'median_house_value']]
corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value 1.000000
median_income 0.687151
total_rooms 0.135140
housing_median_age 0.114146
households 0.064590
total_bedrooms 0.047781
population -0.026882
longitude -0.047466
latitude -0.142673
Name: median_house_value, dtype: float64

# from pandas.tools.plotting import scatter_matrix  # For older versions of Pandas
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12, 8))
save_fig("scatter_matrix_plot")

Saving figure scatter_matrix_plot


housing.plot(kind="scatter", x="median_income", y="median_house_value",
             alpha=0.1)
plt.axis([0, 16, 0, 550000])
save_fig("income_vs_house_value_scatterplot")

Saving figure income_vs_house_value_scatterplot


Experimenting with Attribute Combinations
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

corr_matrix = housing.corr()
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value 1.000000
median_income 0.687151
rooms_per_household 0.146255
total_rooms 0.135140
housing_median_age 0.114146
households 0.064590
total_bedrooms 0.047781
population_per_household -0.021991
population -0.026882
longitude -0.047466
latitude -0.142673
bedrooms_per_room -0.259952
Name: median_house_value, dtype: float64

housing.plot(kind="scatter", x="rooms_per_household", y="median_house_value",
             alpha=0.2)
plt.axis([0, 5, 0, 520000])
plt.show()

housing.describe()

          longitude      latitude  housing_median_age   total_rooms  \
count  16512.000000  16512.000000        16512.000000  16512.000000
mean    -119.575635     35.639314           28.653404   2622.539789
std        2.001828      2.137963           12.574819   2138.417080
min     -124.350000     32.540000            1.000000      6.000000
25%     -121.800000     33.940000           18.000000   1443.000000
50%     -118.510000     34.260000           29.000000   2119.000000
75%     -118.010000     37.720000           37.000000   3141.000000
max     -114.310000     41.950000           52.000000  39320.000000

       total_bedrooms    population    households  median_income  \
count    16354.000000  16512.000000  16512.000000   16512.000000
mean       534.914639   1419.687379    497.011810       3.875884
std        412.665649   1115.663036    375.696156       1.904931
min          2.000000      3.000000      2.000000       0.499900
25%        295.000000    784.000000    279.000000       2.566950
50%        433.000000   1164.000000    408.000000       3.541550
75%        644.000000   1719.000000    602.000000       4.745325
max       6210.000000  35682.000000   5358.000000      15.000100

       median_house_value  rooms_per_household  bedrooms_per_room  \
count        16512.000000         16512.000000       16354.000000
mean        207005.322372             5.440406           0.212873
std         115701.297250             2.611696           0.057378
min          14999.000000             1.130435           0.100000
25%         119800.000000             4.442168           0.175304
50%         179500.000000             5.232342           0.203027
75%         263900.000000             6.056361           0.239816
max         500001.000000           141.909091           1.000000

       population_per_household
count               16512.000000
mean                    3.096469
std                    11.584825
min                     0.692308
25%                     2.431352
50%                     2.817661
75%                     3.281420
max                  1243.333333

Prepare the Data for Machine Learning Algorithms


housing = strat_train_set.drop("median_house_value", axis=1)  # drop labels for training set
housing_labels = strat_train_set["median_house_value"].copy()

Data Cleaning
sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()
sample_incomplete_rows

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
1606     -122.08     37.88                26.0       2947.0             NaN
10915    -117.87     33.73                45.0       2264.0             NaN
19150    -122.70     38.35                14.0       2313.0             NaN
4186     -118.23     34.13                48.0       1308.0             NaN
16885    -122.40     37.58                26.0       3281.0             NaN

       population  households  median_income ocean_proximity
1606        825.0       626.0         2.9330        NEAR BAY
10915      1970.0       499.0         3.4193       <1H OCEAN
19150       954.0       397.0         3.7813       <1H OCEAN
4186        835.0       294.0         4.2891       <1H OCEAN
16885      1145.0       480.0         6.3580      NEAR OCEAN

median = housing["total_bedrooms"].median()
sample_incomplete_rows["total_bedrooms"].fillna(median, inplace=True)  # option 3

C:\Users\Angel Anaya\AppData\Local\Temp\ipykernel_27768\760120979.py:2: FutureWarning:
A value is trying to be set on a copy of a DataFrame or Series through chained
assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because
the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using
'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead,
to perform the operation inplace on the original object.

  sample_incomplete_rows["total_bedrooms"].fillna(median, inplace=True)  # option 3
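
As the warning explains, calling fillna(..., inplace=True) on a chained selection
only modifies a temporary copy in recent pandas. A minimal sketch of the alternatives
the warning itself recommends (using the same median value as above):

# Either assign the filled column back ...
sample_incomplete_rows["total_bedrooms"] = sample_incomplete_rows["total_bedrooms"].fillna(median)
# ... or pass a column-to-value mapping to DataFrame.fillna:
sample_incomplete_rows = sample_incomplete_rows.fillna({"total_bedrooms": median})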

sample_incomplete_rows

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
1606     -122.08     37.88                26.0       2947.0           433.0
10915    -117.87     33.73                45.0       2264.0           433.0
19150    -122.70     38.35                14.0       2313.0           433.0
4186     -118.23     34.13                48.0       1308.0           433.0
16885    -122.40     37.58                26.0       3281.0           433.0

       population  households  median_income ocean_proximity
1606        825.0       626.0         2.9330        NEAR BAY
10915      1970.0       499.0         3.4193       <1H OCEAN
19150       954.0       397.0         3.7813       <1H OCEAN
4186        835.0       294.0         4.2891       <1H OCEAN
16885      1145.0       480.0         6.3580      NEAR OCEAN

from sklearn.impute import SimpleImputer


imputer = SimpleImputer(strategy="median")

housing_num = housing.drop("ocean_proximity", axis=1)

imputer.fit(housing_num)

SimpleImputer(strategy='median')

imputer.statistics_

array([-118.51   ,   34.26   ,   29.     , 2119.     ,  433.     ,
       1164.     ,  408.     ,    3.54155])

housing_num.median().values

array([-118.51   ,   34.26   ,   29.     , 2119.     ,  433.     ,
       1164.     ,  408.     ,    3.54155])

X = imputer.transform(housing_num)

housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing.index)

housing_tr.loc[sample_incomplete_rows.index.values]

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
1606     -122.08     37.88                26.0       2947.0           433.0
10915    -117.87     33.73                45.0       2264.0           433.0
19150    -122.70     38.35                14.0       2313.0           433.0
4186     -118.23     34.13                48.0       1308.0           433.0
16885    -122.40     37.58                26.0       3281.0           433.0

       population  households  median_income
1606        825.0       626.0         2.9330
10915      1970.0       499.0         3.4193
19150       954.0       397.0         3.7813
4186        835.0       294.0         4.2891
16885      1145.0       480.0         6.3580

imputer.strategy

'median'

housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing_num.index)

housing_tr.head()

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
12655    -121.46     38.52                29.0       3873.0           797.0
15502    -117.23     33.09                 7.0       5320.0           855.0
2908     -119.04     35.37                44.0       1618.0           310.0
14053    -117.13     32.75                24.0       1877.0           519.0
20496    -118.70     34.28                27.0       3536.0           646.0

       population  households  median_income
12655      2237.0       706.0         2.1736
15502      2015.0       768.0         6.3373
2908        667.0       300.0         2.8750
14053       898.0       483.0         2.2264
20496      1837.0       580.0         4.4964

Handling Text and Categorical Attributes


housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

ocean_proximity
12655 INLAND
15502 NEAR OCEAN
2908 INLAND
14053 NEAR OCEAN
20496 <1H OCEAN
1481 NEAR BAY
18125 <1H OCEAN
5830 <1H OCEAN
17989 <1H OCEAN
4861 <1H OCEAN

from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

array([[1.],
[4.],
[1.],
[4.],
[0.],
[3.],
[0.],
[0.],
[0.],
[0.]])

ordinal_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

<16512x5 sparse matrix of type '<class 'numpy.float64'>'
	with 16512 stored elements in Compressed Sparse Row format>

housing_cat_1hot.toarray()

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

cat_encoder = OneHotEncoder(sparse_output=False)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])
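
The dense array has one column per category, in the order of cat_encoder.categories_.
A small sketch (assuming Scikit-Learn ≥ 1.0, where get_feature_names_out is available)
to recover the column names from the fitted encoder:

# Names of the one-hot columns, derived from the fitted encoder:
cat_encoder.get_feature_names_out()
# e.g. ['ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND', 'ocean_proximity_ISLAND',
#       'ocean_proximity_NEAR BAY', 'ocean_proximity_NEAR OCEAN']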

Custom Transformers
from sklearn.base import BaseEstimator, TransformerMixin

# column index
rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kargs
        self.add_bedrooms_per_room = add_bedrooms_per_room
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        rooms_per_household = X[:, rooms_ix] / X[:, households_ix]
        population_per_household = X[:, population_ix] / X[:, households_ix]
        if self.add_bedrooms_per_room:
            bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
            return np.c_[X, rooms_per_household, population_per_household,
                         bedrooms_per_room]
        else:
            return np.c_[X, rooms_per_household, population_per_household]

attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

col_names = "total_rooms", "total_bedrooms", "population", "households"
rooms_ix, bedrooms_ix, population_ix, households_ix = [
    housing.columns.get_loc(c) for c in col_names]  # get the column indices

housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns) + ["rooms_per_household",
                                     "population_per_household"],
    index=housing.index)
housing_extra_attribs.head()

      longitude latitude housing_median_age total_rooms total_bedrooms  \
12655   -121.46    38.52               29.0      3873.0          797.0
15502   -117.23    33.09                7.0      5320.0          855.0
2908    -119.04    35.37               44.0      1618.0          310.0
14053   -117.13    32.75               24.0      1877.0          519.0
20496    -118.7    34.28               27.0      3536.0          646.0

      population households median_income ocean_proximity  \
12655     2237.0      706.0        2.1736          INLAND
15502     2015.0      768.0        6.3373      NEAR OCEAN
2908       667.0      300.0         2.875          INLAND
14053      898.0      483.0        2.2264      NEAR OCEAN
20496     1837.0      580.0        4.4964       <1H OCEAN

      rooms_per_household  population_per_household
12655            5.485836                  3.168555
15502            6.927083                  2.623698
2908             5.393333                  2.223333
14053            3.886128                  1.859213
20496            6.096552                  3.167241

Transformation Pipelines
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
('imputer', SimpleImputer(strategy="median")),
('attribs_adder', CombinedAttributesAdder()),
('std_scaler', StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

housing_num_tr

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.01739526,
         0.00622264, -0.12112176],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.56925554,
        -0.04081077, -0.81086696],
       [ 0.26758118, -0.1259716 ,  1.22045984, ..., -0.01802432,
        -0.07537122, -0.33827252],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ..., -0.5092404 ,
        -0.03743619,  0.32286937],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.32814891,
        -0.05915604, -0.45702273],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.01407228,
         0.00657083, -0.12169672]])

from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
full_pipeline = ColumnTransformer([
("num", num_pipeline, num_attribs),
("cat", OneHotEncoder(), cat_attribs),
])

housing_prepared = full_pipeline.fit_transform(housing)

housing_prepared

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

housing_prepared.shape

(16512, 16)
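
The 16 columns line up as 8 original numeric attributes, 3 attributes engineered by
CombinedAttributesAdder, and 5 one-hot categories for ocean_proximity. A minimal
sanity-check sketch (the extra_attribs labels below are illustrative names chosen
here, not something the pipeline produces):

extra_attribs = ["rooms_per_household", "population_per_household",
                 "bedrooms_per_room"]
cat_one_hot_attribs = list(full_pipeline.named_transformers_["cat"].categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
print(len(attributes))  # 16 = 8 numeric + 3 engineered + 5 one-hot categories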

Select and Train a Model


Training and Evaluating on the Training Set
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

LinearRegression()

# let's try the full preprocessing pipeline on a few training instances
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predictions:", lin_reg.predict(some_data_prepared))

Predictions: [ 85657.90192014 305492.60737488 152056.46122456 186095.70946094
 244550.67966089]
print("Labels:", list(some_labels))

Labels: [72100.0, 279600.0, 82700.0, 112500.0, 238300.0]

some_data_prepared

array([[-0.94135046,  1.34743822,  0.02756357,  0.58477745,  0.64037127,
         0.73260236,  0.55628602, -0.8936472 ,  0.01739526,  0.00622264,
        -0.12112176,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.17178212, -1.19243966, -1.72201763,  1.26146668,  0.78156132,
         0.53361152,  0.72131799,  1.292168  ,  0.56925554, -0.04081077,
        -0.81086696,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, -0.46977281, -0.54513828,
        -0.67467519, -0.52440722, -0.52543365, -0.01802432, -0.07537122,
        -0.33827252,  0.        ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.22173797, -1.35147437, -0.37006852, -0.34865152, -0.03636724,
        -0.46761716, -0.03729672, -0.86592882, -0.59513997, -0.10680295,
         0.96120521,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.43743108, -0.63581817, -0.13148926,  0.42717947,  0.27279028,
         0.37406031,  0.22089846,  0.32575178,  0.2512412 ,  0.00610923,
        -0.47451338,  1.        ,  0.        ,  0.        ,  0.        ,
         0.        ]])

from sklearn.metrics import mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print("root_mean_squared_error:", lin_rmse)

root_mean_squared_error: 68627.87390018745

from sklearn.metrics import mean_squared_error

lin_mse = mean_squared_error(housing_labels, housing_predictions)
print("mean_squared_error:", lin_mse)

mean_squared_error: 4709785076.060029

from sklearn.metrics import mean_absolute_error

lin_mae = mean_absolute_error(housing_labels, housing_predictions)
print("mean_absolute_error:", lin_mae)

mean_absolute_error: 49438.66860915801
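
As an aside (not part of the original notebook), recent Scikit-Learn versions can
compute the RMSE in a single call instead of np.sqrt(mean_squared_error(...)); a
minimal sketch, assuming Scikit-Learn ≥ 1.4:

from sklearn.metrics import root_mean_squared_error

# One-call RMSE; equivalent to the np.sqrt(mean_squared_error(...)) used above.
lin_rmse = root_mean_squared_error(housing_labels, housing_predictions)
print("root_mean_squared_error:", lin_rmse)

On versions 0.22 through 1.5, mean_squared_error(housing_labels, housing_predictions,
squared=False) gives the same result.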
