import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Build a synthetic dataset whose predictors are deliberately collinear.
np.random.seed(0)  # fixed seed so every run draws the same numbers
size = 300  # number of rows; a larger sample makes the Ridge effect easier to see


def _gaussian(scale):
    """Return `size` draws from a zero-mean normal with the given standard deviation."""
    return np.random.normal(0, scale, size)


# NOTE: the order of the draws below determines the generated values -- do not reorder.
# X1 is the base predictor; X2 is X1 plus small noise, so the two are almost collinear.
X1 = _gaussian(1)
X2 = X1 + _gaussian(0.1)
# X3 is the average of X1 and X2 plus small noise, so it tracks both of them.
X3 = 0.5 * X1 + 0.5 * X2 + _gaussian(0.1)
# The response is a linear combination of the three predictors plus unit-variance noise.
Y = 2 * X1 + 3 * X2 + 1.5 * X3 + _gaussian(1)
# Collect predictors and response in one DataFrame (response is the last column).
df = pd.DataFrame(dict(X1=X1, X2=X2, X3=X3, Y=Y))
# Calculating VIFs to show multicollinearity.
# A predictor's VIF comes from regressing it on the *other predictors only*, so the
# response column Y is excluded from the design matrix.
predictor_names = [col for col in df.columns if col != 'Y']
# variance_inflation_factor fits its auxiliary regressions without adding an intercept,
# so prepend an explicit constant column; otherwise the VIFs are based on an uncentred R^2.
vif_design = np.column_stack([np.ones(len(df)), df[predictor_names].to_numpy()])
vif_data = pd.DataFrame({
    "feature": predictor_names,
    # Column 0 is the constant, so predictor k lives at column k + 1.
    "VIF": [
        variance_inflation_factor(vif_design, col_idx)
        for col_idx in range(1, vif_design.shape[1])
    ],
})
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df[['X1', 'X2', 'X3']],
    df['Y'],
    test_size=0.2,
    random_state=42,
)
# Baseline: ordinary least squares (fit() returns the fitted estimator itself).
lr_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
# Ridge regression with a large L2 penalty (alpha=100) to counteract the multicollinearity.
ridge_model = Ridge(alpha=100).fit(X_train, y_train)
y_pred_ridge = ridge_model.predict(X_test)
# Score both models on the held-out test set (mean squared error and R^2).
mse_lr, r2_lr = mean_squared_error(y_test, y_pred_lr), r2_score(y_test, y_pred_lr)
mse_ridge, r2_ridge = mean_squared_error(y_test, y_pred_ridge), r2_score(y_test, y_pred_ridge)
# Report the VIF table first, then one summary line per model.
print("Variance Inflation Factor (VIF):", vif_data, sep="\n")
lr_summary = "\nLinear Regression - MSE: {:.2f}, R2: {:.3f}".format(mse_lr, r2_lr)
ridge_summary = "Ridge Regression - MSE: {:.2f}, R2: {:.3f}".format(mse_ridge, r2_ridge)
print(lr_summary)
print(ridge_summary)