0% found this document useful (0 votes)
5 views

a

Uploaded by

Houssam Alrifaii
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
5 views

a

Uploaded by

Houssam Alrifaii
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 2

# Import the necessary libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Read the house listings from "house.csv" into a pandas DataFrame.
df = pd.read_csv("house.csv")

# Flag every row that repeats an earlier row, then report how many there are.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"Number of duplicate records: {duplicates}")

# Drop unnecessary fields (House_Id in this case) and determine the features
# and the target fields.
# NOTE: this comment previously wrapped onto a line without a leading '#',
# which made the script fail to parse.
df = df.drop(columns=["House_Id"])
features = ['Area', 'Bedrooms', 'Bathrooms', 'Neighborhood']
target = 'Price'

# Report how many null entries each column contains.
missing_values = df.isna().sum()
print(f"Missing values in each column:\n{missing_values}")

# Report the mean of the 'Area' column over all rows.
area_column = df['Area']
average_area = area_column.mean()
print(f"Average area of all houses: {average_area}")

# Perform one-hot encoding on the 'Neighborhood' feature.
#
# OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so `OneHotEncoder(sparse=False)` raises
# TypeError on current releases. Keeping the default (sparse) output and
# densifying it with .toarray() works on both old and new versions.
encoder = OneHotEncoder()
neighborhood_encoded = pd.DataFrame(
    encoder.fit_transform(df[['Neighborhood']]).toarray(),
    columns=encoder.get_feature_names_out(['Neighborhood']),
    # Reuse df's index so the column-wise concat below pairs each encoded row
    # with its source row even if df's index is not a default RangeIndex.
    index=df.index,
)

# Replace the original categorical column with its indicator columns.
df = pd.concat([df.drop(columns=['Neighborhood']), neighborhood_encoded], axis=1)

# Separate the target column from the predictors (every other column).
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Create the linear regression model and fit it on the training rows.
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred = model.predict(X_test)

# Score the test-set predictions with three regression metrics.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Keep label -> value pairs in one place so the line-by-line printout and the
# summary table below share the same labels and ordering.
metric_scores = {
    "Mean Absolute Error": mae,
    "Mean Squared Error": mse,
    "R-Squared Score": r2,
}
for metric_name, metric_value in metric_scores.items():
    print(f"{metric_name}: {metric_value}")

# Display the evaluation metrics as a two-column table.
metrics = pd.DataFrame({
    "Metric": list(metric_scores),
    "Value": list(metric_scores.values()),
})
print(metrics)

# Create a scatter plot to visualize the relationship between the number of
# bathrooms and the price.
# NOTE: this comment previously wrapped onto a line without a leading '#',
# which made the script fail to parse.
plt.figure(figsize=(8, 6))
# alpha=0.5 makes the markers semi-transparent so overlapping points stay visible.
plt.scatter(df['Bathrooms'], df['Price'], alpha=0.5, color='blue')
plt.title("Bathrooms vs Price")
plt.xlabel("Number of Bathrooms")
plt.ylabel("Price")
plt.grid(True)
plt.show()

# Find the house(s) with the highest number of bedrooms and print the
# neighborhood of each.
#
# By this point the raw 'Neighborhood' column has been replaced by one-hot
# indicator columns named "Neighborhood_<value>", so df['Neighborhood'] would
# raise KeyError. The neighborhood is recovered from whichever indicator
# column holds the largest value in each selected row.
neighborhood_prefix = "Neighborhood_"
neighborhood_columns = [
    column for column in df.columns
    if str(column).startswith(neighborhood_prefix)
]

# Several houses can tie for the maximum, so this may select more than one row.
max_bedrooms_house = df[df['Bedrooms'] == df['Bedrooms'].max()]

if neighborhood_columns:
    most_bedrooms_neighborhoods = (
        max_bedrooms_house[neighborhood_columns]
        .idxmax(axis=1)
        .map(lambda column: column[len(neighborhood_prefix):])
        .values
    )
else:
    # The column has not been encoded (e.g. the encoding step was skipped),
    # so read it directly.
    most_bedrooms_neighborhoods = max_bedrooms_house['Neighborhood'].values

# The message is kept in a single logical string; the original literal was
# broken across two physical lines, which is a syntax error.
print(
    "Neighborhood of the house with the most bedrooms: "
    f"{most_bedrooms_neighborhoods}"
)

# Plot the performance metrics calculated above on a heatmap.
# The three scores are arranged as a single row (shape 1 x 3) so each one
# becomes a cell in the heatmap.
metrics_values = np.array([mae, mse, r2]).reshape(1, -1)

plt.figure(figsize=(8, 4))
heatmap_axes = sns.heatmap(
    metrics_values,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    xticklabels=["MAE", "MSE", "R2"],
    yticklabels=["Model"],
)
heatmap_axes.set_title("Model Performance Metrics")
plt.show()

You might also like