0% found this document useful (0 votes)
6 views

message (2)

The document is a Python script for a machine learning homework assignment focused on regression techniques, specifically Linear and Ridge Regression using the California housing dataset. It includes functions for data preprocessing, model training, evaluation, and visualization of results. The script also compares the performance of different models and regularization techniques using metrics like RMSE and R².

Uploaded by

jjie9622
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
0% found this document useful (0 votes)
6 views

message (2)

The document is a Python script for a machine learning homework assignment focused on regression techniques, specifically Linear and Ridge Regression using the California housing dataset. It includes functions for data preprocessing, model training, evaluation, and visualization of results. The script also compares the performance of different models and regularization techniques using metrics like RMSE and R².

Uploaded by

jjie9622
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT or PDF, or read online on Scribd
You are on page 1/ 5

# Homework 3 - main file

# COMP.4220 Machine Learning

import itertools, functools


import numpy as np
import matplotlib.pyplot as plt
from regression import LinearRegression, RidgeRegression
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge as skRidge
from sklearn.linear_model import LinearRegression as skLinearRegression
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler as skStandardScalar
from sklearn.preprocessing import StandardScaler

def train_test_split(X, t, test_size=0.2, random_state=None):
    """Split data into shuffled training and testing sets using only NumPy.

    Args:
        X: Array-like of samples; the first axis indexes samples.
        t: Array-like of targets with the same length as ``X``.
        test_size: Fraction of the samples (between 0 and 1) placed in the
            test set. The test-set size is ``round(n_samples * test_size)``.
        random_state: Optional seed. When not ``None`` it is passed to
            ``np.random.seed`` so the shuffle is reproducible.

    Returns:
        The tuple ``(X_train, X_test, t_train, t_test)`` as NumPy arrays.

    Raises:
        ValueError: If ``X`` and ``t`` have different lengths or
            ``test_size`` lies outside ``[0, 1]``.
    """
    X = np.asarray(X)
    t = np.asarray(t)
    n_samples = len(X)
    if n_samples != len(t):
        raise ValueError(
            f"X and t must have the same number of samples "
            f"({n_samples} != {len(t)})"
        )
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be in [0, 1], got {test_size}")

    # `is not None` rather than truthiness, so that a seed of 0 is honoured.
    if random_state is not None:
        np.random.seed(random_state)

    # ---- Part (d) ---- #
    # 1. Shuffle the data: one permutation indexes both arrays, which keeps
    #    every sample paired with its own target.
    indices = np.random.permutation(n_samples)

    # 2. Split the data: the last `n_test` shuffled indices form the test set.
    n_test = int(round(n_samples * test_size))
    split_index = n_samples - n_test
    train_idx = indices[:split_index]
    test_idx = indices[split_index:]

    X_train = X[train_idx]
    X_test = X[test_idx]
    t_train = t[train_idx]
    t_test = t[test_idx]

    return X_train, X_test, t_train, t_test

def standardscalar(x: np.ndarray):
    """Standardize ``x`` column-wise to zero mean and unit variance.

    The mean and (population) standard deviation are computed along axis 0.
    Columns with zero standard deviation are only centred: their divisor is
    replaced by 1 to avoid a division by zero, matching how scikit-learn's
    ``StandardScaler`` treats zero-variance features.

    Args:
        x: Array-like of values; statistics are taken over the first axis.

    Returns:
        A float array of the same shape as ``x``.
    """
    # ---- Part (b) ---- #
    x = np.asarray(x, dtype=float)
    mean = np.mean(x, axis=0)
    std = np.std(x, axis=0)
    # Constant columns have std == 0; dividing by 1 leaves them at zero.
    std = np.where(std == 0, 1.0, std)
    return (x - mean) / std

class PolynomialFeature(object):
    """Expand inputs into polynomial features up to a given total degree.

    For each sample the output holds a constant 1 followed by every product
    of input columns taken with replacement, for degrees 1 through
    ``degree``.
    """

    def __init__(self, degree=2):
        """Store the maximum polynomial degree.

        Args:
            degree: Highest total degree of the generated monomials; must be
                an ``int``.
        """
        assert isinstance(degree, int)
        self.degree = degree

    def transform(self, x):
        """Return the polynomial feature matrix for ``x``.

        Args:
            x: Array-like of shape ``(n_samples, n_features)``. A 1-D input
                is treated as ``n_samples`` values of a single feature.

        Returns:
            Array of shape ``(n_samples, n_terms)`` whose first column is
            all ones.
        """
        # Accept plain lists/tuples as well as ndarrays.
        x = np.asarray(x)
        if x.ndim == 1:
            x = x[:, None]
        # Iterating the transpose yields one array per input feature.
        columns = x.transpose()
        features = [np.ones(len(x))]
        for degree in range(1, self.degree + 1):
            for items in itertools.combinations_with_replacement(columns, degree):
                # Element-wise product of the chosen feature columns.
                features.append(functools.reduce(np.multiply, items))
        return np.asarray(features).transpose()

def _evaluate(t_true, t_pred):
    """Return ``(rmse, r2)`` of ``t_pred`` against ``t_true``."""
    return root_mean_squared_error(t_true, t_pred), r2_score(t_true, t_pred)


def _plot_true_vs_predicted(t_true, t_pred, title, color):
    """Scatter true vs. predicted targets on the current axes with a y = x line."""
    plt.scatter(t_true, t_pred, color=color)
    plt.plot([t_true.min(), t_true.max()], [t_true.min(), t_true.max()],
             color='red')
    plt.xlabel('True value')
    plt.ylabel('Predicted value')
    plt.title(title)


def main():
    """Run the Homework 3 regression experiments on California housing.

    Steps: load the dataset, standardize the features (checked against
    sklearn's ``StandardScaler``), split into train/test sets, fit the
    project ``LinearRegression``, a closed-form ridge model and their sklearn
    counterparts, print RMSE / R2 for each, sweep the ridge regularization
    strength, and plot the results.

    Raises:
        RuntimeError: If ``train_test_split`` returns an empty test set, since
            no model could then be evaluated.
    """
    # ---- Part (a) ---- #
    housing = fetch_california_housing()
    X = housing.data
    t = housing.target
    print(X.shape, t.shape)
    print(housing.feature_names[:6])
    print(housing.DESCR)

    # ---- Part (b) ---- #
    def standardscalar(x):
        mean = np.mean(x, axis=0)
        std = np.std(x, axis=0)
        return (x - mean) / std

    X_scaled = standardscalar(X)

    # Standardize the data using sklearn's StandardScaler
    scaler = StandardScaler()
    X_sklearn_scaled = scaler.fit_transform(X)
    print(np.allclose(X_scaled, X_sklearn_scaled))  # Should be True
    print(np.allclose(np.mean(X_scaled, axis=0),
                      np.mean(X_sklearn_scaled, axis=0)))  # Should be True
    print(np.allclose(np.std(X_scaled, axis=0),
                      np.std(X_sklearn_scaled, axis=0)))  # Should be True

    # ---- Part (c) ---- #
    # Element-wise difference between the two standardizations.
    print(X_scaled - X_sklearn_scaled)

    # ---- Part (d) ---- #
    # The standardized features are split because the ridge penalty depends
    # on the scale of each feature.
    X_train, X_test, t_train, t_test = train_test_split(
        X_scaled, t, test_size=0.2, random_state=42)
    if len(X_test) == 0:
        raise RuntimeError(
            'train_test_split returned an empty test set; '
            'cannot evaluate the models')

    # Closed-form ridge regression. Inside main() this local class takes the
    # place of the RidgeRegression imported at the top of the file.
    class RidgeRegression:
        def __init__(self, lambda_):
            self.lambda_ = lambda_

        def fit(self, X, t):
            # Add the regularization term (lambda * I) and solve the normal
            # equations directly instead of forming an explicit inverse.
            I = np.identity(X.shape[1])
            self.w = np.linalg.solve(X.T @ X + self.lambda_ * I, X.T @ t)

        def predict(self, X):
            # NOTE(review): no intercept term is fitted here - confirm that
            # this is intended.
            return X @ self.w

    # Model building
    lr = LinearRegression()
    lr.fit(X_train, t_train)
    y_lr = lr.predict(X_test)
    rmse_linreg, r2_linreg = _evaluate(t_test, y_lr)
    print('Linear Regression results')
    print(f'RMSE: {rmse_linreg}')
    print(f'R2: {r2_linreg}')

    rr = RidgeRegression(lambda_=1.0)
    rr.fit(X_train, t_train)
    y_rr = rr.predict(X_test)
    rmse_rr, r2_rr = _evaluate(t_test, y_rr)
    print('Ridge Regression results')
    print(f'RMSE: {rmse_rr}')
    print(f'R2: {r2_rr}')

    lr_sk = skLinearRegression()
    lr_sk.fit(X_train, t_train)
    y_lr_sk = lr_sk.predict(X_test)
    rmse_lr_sk, r2_lr_sk = _evaluate(t_test, y_lr_sk)
    print('Sklearn Linear Regression results')
    print(f'RMSE: {rmse_lr_sk}')
    print(f'R2: {r2_lr_sk}')

    rr_sk = skRidge(alpha=1.0)
    rr_sk.fit(X_train, t_train)
    y_rr_sk = rr_sk.predict(X_test)
    rmse_rr_sk, r2_rr_sk = _evaluate(t_test, y_rr_sk)
    print('Sklearn Ridge Regression results')
    print(f'RMSE: {rmse_rr_sk}')
    print(f'R2: {r2_rr_sk}')

    # ---- Part (g, h) ---- #
    # (g) Plotting true vs predicted for both models
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    _plot_true_vs_predicted(t_test, y_rr,
                            'Ridge Regression: True vs Predicted', 'blue')

    plt.subplot(1, 2, 2)
    _plot_true_vs_predicted(t_test, y_lr,
                            'Linear Regression: True vs Predicted', 'green')

    plt.show()

    # (h) Try different values of regularization lambda for Ridge regression
    ridge_2 = RidgeRegression(lambda_=10.0)
    ridge_2.fit(X_train, t_train)
    t_pred_ridge_2 = ridge_2.predict(X_test)

    # Evaluate new Ridge regression
    rmse_ridge_2, r2_ridge_2 = _evaluate(t_test, t_pred_ridge_2)
    print(f'Ridge Regression (lambda=10) RMSE: {rmse_ridge_2}')
    print(f'Ridge Regression (lambda=10) R²: {r2_ridge_2}')

    # ---- Part (i) ---- #
    # (i) Ridge Regression for different lambda values
    lambdas = [0.1, 1.0, 10.0, 100.0]
    ridge_models = []
    rmse_values = []
    r2_values = []

    for lambda_val in lambdas:
        ridge_model = RidgeRegression(lambda_=lambda_val)
        ridge_model.fit(X_train, t_train)
        t_pred_ridge = ridge_model.predict(X_test)

        rmse_ridge, r2_ridge = _evaluate(t_test, t_pred_ridge)

        ridge_models.append(ridge_model)
        rmse_values.append(rmse_ridge)
        r2_values.append(r2_ridge)

    # ---- Part (j) ---- #
    plt.figure(figsize=(12, 6))

    # Plot RMSE
    plt.subplot(1, 2, 1)
    plt.plot(lambdas, rmse_values, marker='o', linestyle='-', color='b')
    plt.xscale('log')
    plt.title('RMSE for Ridge Regression with different lambda values')
    plt.xlabel('Lambda')
    plt.ylabel('RMSE')

    # Plot R²
    plt.subplot(1, 2, 2)
    plt.plot(lambdas, r2_values, marker='o', linestyle='-', color='r')
    plt.xscale('log')
    plt.title('R² for Ridge Regression with different lambda values')
    plt.xlabel('Lambda')
    plt.ylabel('R²')
    plt.show()

    # ---- Part (k) ---- #
    # Runs after the sweep because it reads the model fitted with lambda = 1.0
    # out of `ridge_models`.
    print(f'Linear Regression RMSE: {rmse_linreg}')
    print(f'Linear Regression R²: {r2_linreg}')

    # Compare the performance with the best Ridge model (with lambda = 1.0)
    best_ridge_model = ridge_models[lambdas.index(1.0)]
    t_pred_best_ridge = best_ridge_model.predict(X_test)
    rmse_best_ridge, r2_best_ridge = _evaluate(t_test, t_pred_best_ridge)

    print(f'Best Ridge Regression (lambda=1.0) RMSE: {rmse_best_ridge}')
    print(f'Best Ridge Regression (lambda=1.0) R²: {r2_best_ridge}')

    # Comparing the results
    print("\nComparison of Linear Regression and Ridge Regression (lambda=1.0):")
    print(f"Linear Regression RMSE: {rmse_linreg}, R²: {r2_linreg}")
    print(f"Ridge Regression (lambda=1.0) RMSE: {rmse_best_ridge}, "
          f"R²: {r2_best_ridge}")

    # Plot the results: one true-vs-predicted panel per model.
    plt.figure(figsize=(12, 6))

    plt.subplot(2, 2, 1)
    _plot_true_vs_predicted(t_test, y_lr, 'Linear Regression', 'green')

    plt.subplot(2, 2, 2)
    _plot_true_vs_predicted(t_test, y_rr,
                            'Ridge Regression (lambda=1.0)', 'blue')

    plt.subplot(2, 2, 3)
    _plot_true_vs_predicted(t_test, y_lr_sk,
                            'Sklearn Linear Regression', 'orange')

    plt.subplot(2, 2, 4)
    _plot_true_vs_predicted(t_test, y_rr_sk,
                            'Sklearn Ridge Regression (alpha=1.0)', 'purple')

    plt.tight_layout()
    plt.show()

# Run the experiments only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

You might also like