# Predicting House Prices
# Step 1: Exploratory Data Analysis (EDA)
# Third-party libraries used by this step. They were referenced below
# (pd, plt, sns) without ever being imported.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the Boston Housing dataset.
# `load_boston` was removed in scikit-learn 1.2, so importing it raises
# ImportError on current releases. Fall back to the original data file,
# parsed the way the scikit-learn removal notice recommends.
BOSTON_DATA_URL = "http://lib.stat.cmu.edu/datasets/boston"
BOSTON_FEATURE_NAMES = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
]

try:
    from sklearn.datasets import load_boston
except ImportError:
    from sklearn.utils import Bunch

    # Each record in the raw file spans two physical lines: 11 values on the
    # first line, then the last 2 features and the target on the second.
    raw_boston = pd.read_csv(BOSTON_DATA_URL, sep=r"\s+", skiprows=22, header=None)
    boston = Bunch(
        data=np.hstack([raw_boston.values[::2, :], raw_boston.values[1::2, :2]]),
        target=raw_boston.values[1::2, 2],
        feature_names=np.array(BOSTON_FEATURE_NAMES),
    )
else:
    boston = load_boston()

# One column per feature, plus the target stored under 'PRICE'.
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df['PRICE'] = boston.target

# Display the first few rows
print("Initial Data:")
print(df.head())

# EDA: Summary statistics
print("\nSummary Statistics:")
print(df.describe())

# Visualize pairwise correlations between all columns (features and target)
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Distribution of target variable
plt.figure(figsize=(8, 5))
sns.histplot(df['PRICE'], bins=30, kde=True)
plt.title('Price Distribution')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
# Step 2: Data Preprocessing
# We'll check for missing values and standardize the features.
# Report how many missing values each column contains.
print("\nMissing Values:")
print(df.isna().sum())

# Standardize every feature column; the target column 'PRICE' is left as-is.
# NOTE(review): the scaler is fitted on every row of `df`, i.e. before the
# train/test split performed later in this script, so rows that end up in the
# test set contribute to the scaling statistics. Confirm this is acceptable.
from sklearn.preprocessing import StandardScaler

features = df.drop(columns='PRICE')
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Rebuild a DataFrame from the scaled array (same column names) and attach the
# unscaled target values positionally.
df_scaled = pd.DataFrame(features_scaled, columns=features.columns).assign(
    PRICE=df['PRICE'].values
)
# Step 3: Data Splitting
# We'll split the dataset into training and testing sets.
from sklearn.model_selection import train_test_split

# Separate the target column (y) from the predictor columns (X).
y = df_scaled['PRICE']
X = df_scaled.drop(columns='PRICE')

# Hold out 20% of the rows for testing; a fixed random_state makes the
# shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"\nTraining set shape: {X_train.shape}, Testing set shape: {X_test.shape}")
# Step 4: Model Implementation
# We will implement three regression algorithms: Linear Regression, Decision Trees, and Random
# Forests.
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

# Construct and train each regressor on the training split. `fit` returns the
# fitted estimator, so construction and training are chained. The tree-based
# models get a fixed random_state so repeated runs give the same model.
linear_model = LinearRegression().fit(X_train, y_train)
tree_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
forest_model = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict the target for the held-out test rows with each trained model.
y_pred_linear = linear_model.predict(X_test)
y_pred_tree = tree_model.predict(X_test)
y_pred_forest = forest_model.predict(X_test)
# Step 5: Model Evaluation
# We'll evaluate the models using Mean Absolute Error (MAE) and Root Mean Squared Error (RMSE).
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Evaluate models
def evaluate_model(y_true, y_pred, model_name):
    """Print the MAE and RMSE of one model's predictions.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, in the same order as ``y_true``.
        model_name: Label printed in the report header.

    Returns:
        None. The metrics are written to standard output, rounded to two
        decimal places.
    """
    mae = mean_absolute_error(y_true, y_pred)
    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"\n{model_name} Performance:")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")


# Report the test-set metrics for each trained model, in a fixed order.
for model_name, predictions in (
    ("Linear Regression", y_pred_linear),
    ("Decision Tree", y_pred_tree),
    ("Random Forest", y_pred_forest),
):
    evaluate_model(y_test, predictions, model_name)