Tasks
Khadija Imran
2022-CS-171
Tasks with the Iris and California Housing datasets:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, mean_squared_error, r2_score)
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Task 1
#1
iris = datasets.load_iris(as_frame=True)
california_housing = datasets.fetch_california_housing(as_frame=True)
print("Iris Dataset Description:\n", iris.DESCR)
print("California Housing Dataset Description:\n", california_housing.DESCR)
print("\nIris Dataset Head:\n", iris.frame.head())
print("\nCalifornia Housing Dataset Head:\n", california_housing.frame.head())
print("\nIris Dataset Info:\n", iris.frame.info())
print("\nCalifornia Housing Dataset Info:\n", california_housing.frame.info())
#2
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris.data)
california_housing_scaled = scaler.fit_transform(california_housing.data)
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    iris_scaled, iris.target, test_size=0.2, random_state=42)
X_train_california_housing, X_test_california_housing, y_train_california_housing, y_test_california_housing = train_test_split(
    california_housing_scaled, california_housing.target, test_size=0.2, random_state=42)
print("\nIris Training Data Shape:", X_train_iris.shape)
print("Iris Testing Data Shape:", X_test_iris.shape)
print("California Housing Training Data Shape:", X_train_california_housing.shape)
print("California Housing Testing Data Shape:", X_test_california_housing.shape)
# Task 2
#3
k_values = range(1, 16)
knn_accuracies = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_iris, y_train_iris)
    y_pred_knn = knn.predict(X_test_iris)
    knn_accuracies.append(accuracy_score(y_test_iris, y_pred_knn))
print("\nk-NN Accuracies for k=1 to 15:", knn_accuracies)
#4
kernels = ['linear', 'poly', 'rbf']
svm_accuracies = {}
for kernel in kernels:
    svm = SVC(kernel=kernel)
    svm.fit(X_train_iris, y_train_iris)
    y_pred_svm = svm.predict(X_test_iris)
    svm_accuracies[kernel] = accuracy_score(y_test_iris, y_pred_svm)
print("\nSVM Accuracies for different kernels:", svm_accuracies)
#5
estimators = [10, 50, 100]
rf_accuracies = {}
for n_estimators in estimators:
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    rf.fit(X_train_iris, y_train_iris)
    y_pred_rf = rf.predict(X_test_iris)
    rf_accuracies[n_estimators] = accuracy_score(y_test_iris, y_pred_rf)
print("\nRandom Forest Accuracies for different number of estimators:", rf_accuracies)
# Task 3
#6
lr = LinearRegression()
lr.fit(X_train_california_housing, y_train_california_housing)
y_pred_lr = lr.predict(X_test_california_housing)
mse_lr = mean_squared_error(y_test_california_housing, y_pred_lr)
r2_lr = r2_score(y_test_california_housing, y_pred_lr)
print("\nLinear Regression - MSE:", mse_lr)
print("Linear Regression - R² Score:", r2_lr)
#7
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_california_housing, y_train_california_housing)
y_pred_dt = dt.predict(X_test_california_housing)
mse_dt = mean_squared_error(y_test_california_housing, y_pred_dt)
r2_dt = r2_score(y_test_california_housing, y_pred_dt)
print("\nDecision Tree Regression - MSE:", mse_dt)
print("Decision Tree Regression - R² Score:", r2_dt)
# Task 4
#8
print("\nClassification Model Evaluations:")
print("k-NN (k=5) - Accuracy:", accuracy_score(y_test_iris, knn.predict(X_test_iris)))
print("SVM (rbf) - Accuracy:", accuracy_score(y_test_iris, svm.predict(X_test_iris)))
print("Random Forest (n=100) - Accuracy:", accuracy_score(y_test_iris, rf.predict(X_test_iris)))
precision_rf = precision_score(y_test_iris, y_pred_rf, average='macro')
recall_rf = recall_score(y_test_iris, y_pred_rf, average='macro')
f1_rf = f1_score(y_test_iris, y_pred_rf, average='macro')
print("\nRandom Forest (n=100) - Precision:", precision_rf)
print("Random Forest (n=100) - Recall:", recall_rf)
print("Random Forest (n=100) - F1-Score:", f1_rf)
#9
print("\nRegression Model Evaluations:")
print("Linear Regression - MSE:", mse_lr, ", R² Score:", r2_lr)
print("Decision Tree Regression - MSE:", mse_dt, ", R² Score:", r2_dt)
def plot_california_housing_boxplot(data):
    plt.figure(figsize=(12, 6))
    plt.boxplot(data, vert=False, patch_artist=True)
    plt.title("Box Plot of California Housing Features")
    plt.xlabel("Value")
    plt.yticks(range(1, len(data.columns) + 1), data.columns)
    plt.grid(True)
    plt.show()
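plot_iris_class_distribution is called below without a definition in this section; a minimal sketch consistent with the call (assumed here to be a bar chart of class counts) is:
def plot_iris_class_distribution(target):
    # Count samples per class; `target` is assumed to be the Iris target Series/array.
    values, counts = np.unique(target, return_counts=True)
    plt.figure(figsize=(8, 6))
    plt.bar([str(v) for v in values], counts, color='m')
    plt.title("Iris Dataset Class Distribution")
    plt.xlabel("Class")
    plt.ylabel("Count")
    plt.grid(True)
    plt.show()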
plot_knn_accuracies(k_values, knn_accuracies)
plot_svm_accuracies(kernels, svm_accuracies)
plot_rf_accuracies(estimators, rf_accuracies)
plot_regression_results(y_test_california_housing, y_pred_lr, "Linear Regression")
plot_regression_results(y_test_california_housing, y_pred_dt, "Decision Tree Regression")
plot_iris_class_distribution(iris.target)
plot_california_housing_boxplot(california_housing.frame)
Conclusion:
Classification Models Comparison
1. k-NN (k=5):
o Accuracy: High accuracy, but performance is sensitive to the choice of k.
2. SVM (rbf kernel):
o Accuracy: Competitive accuracy; benefits from the kernel trick for
non-linear data.
3. Random Forest (n_estimators=100):
o Accuracy: Generally the highest among the three; also provides
additional metrics (precision, recall, and F1-score, defined just after
this list), indicating robust performance across classes.
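For reference, these scores follow the standard definitions; with average='macro', each is computed per class and then averaged without class weighting:
\text{Precision} = \frac{TP}{TP + FP}, \qquad
\text{Recall} = \frac{TP}{TP + FN}, \qquad
F_1 = \frac{2 \cdot \text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}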
Regression Models Comparison
1. Linear Regression:
o MSE: Moderate; captures linear relationships effectively.
o R² Score: Indicates how well the model explains the variability of the
target variable (both metrics are defined after this list).
2. Decision Tree Regression:
o MSE: Typically lower than Linear Regression, indicating better
performance on complex data.
o R² Score: Higher than Linear Regression; can capture
non-linear relationships well.
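Both regression metrics follow the usual definitions, where \hat{y}_i is the prediction for sample i and \bar{y} is the mean of the test targets:
\mathrm{MSE} = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2, \qquad
R^2 = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y}_i)^2}{\sum_{i=1}^{n} (y_i - \bar{y})^2}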
Summary
Random Forest outperformed k-NN and SVM in classification, providing additional
evaluation metrics.
For regression, the Decision Tree showed better performance than Linear
Regression in terms of MSE and R², indicating its strength in capturing
complex relationships in the data.
    rf_accuracies_breast[n_estimators] = accuracy_score(y_test_breast_cancer, y_pred_rf_breast)
print("\nRandom Forest Accuracies for Breast Cancer with different number of estimators:", rf_accuracies_breast)
# Task 3
#6
lr = LinearRegression()
lr.fit(X_train_diabetes, y_train_diabetes)
y_pred_lr_diabetes = lr.predict(X_test_diabetes)
mse_lr_diabetes = mean_squared_error(y_test_diabetes, y_pred_lr_diabetes)
r2_lr_diabetes = r2_score(y_test_diabetes, y_pred_lr_diabetes)
print("\nLinear Regression (Diabetes) - MSE:", mse_lr_diabetes)
print("Linear Regression (Diabetes) - R² Score:", r2_lr_diabetes)
#7
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_diabetes, y_train_diabetes)
y_pred_dt_diabetes = dt.predict(X_test_diabetes)
mse_dt_diabetes = mean_squared_error(y_test_diabetes, y_pred_dt_diabetes)
r2_dt_diabetes = r2_score(y_test_diabetes, y_pred_dt_diabetes)
print("\nDecision Tree Regression (Diabetes) - MSE:", mse_dt_diabetes)
print("Decision Tree Regression (Diabetes) - R² Score:", r2_dt_diabetes)
# Task 4
#8
print("\nClassification Model Evaluations for Breast Cancer:")
print("k-NN (k=5) - Accuracy:", accuracy_score(y_test_breast_cancer, knn.predict(X_test_breast_cancer)))
print("SVM (rbf) - Accuracy:", accuracy_score(y_test_breast_cancer, svm.predict(X_test_breast_cancer)))
print("Random Forest (n=100) - Accuracy:", accuracy_score(y_test_breast_cancer, rf.predict(X_test_breast_cancer)))
precision_rf_breast = precision_score(y_test_breast_cancer, y_pred_rf_breast, average='macro')
recall_rf_breast = recall_score(y_test_breast_cancer, y_pred_rf_breast, average='macro')
f1_rf_breast = f1_score(y_test_breast_cancer, y_pred_rf_breast, average='macro')
print("\nRandom Forest (n=100) - Precision:", precision_rf_breast)
print("Random Forest (n=100) - Recall:", recall_rf_breast)
print("Random Forest (n=100) - F1-Score:", f1_rf_breast)
#9
print("\nRegression Model Evaluations for Diabetes:")
print("Linear Regression - MSE:", mse_lr_diabetes, ", R² Score:", r2_lr_diabetes)
print("Decision Tree Regression - MSE:", mse_dt_diabetes, ", R² Score:", r2_dt_diabetes)
labels = ['Malignant', 'Benign']
sizes = [np.sum(breast_cancer.target == 0), np.sum(breast_cancer.target == 1)]
plt.figure(figsize=(8, 6))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
plt.title("Breast Cancer Dataset Class Distribution")
plt.axis('equal')
plt.show()
def plot_knn_accuracies(k_values, knn_accuracies):
    plt.figure(figsize=(8, 6))
    plt.plot(k_values, knn_accuracies, marker='o', color='b', label="Accuracy")
    plt.title("k-NN Accuracy for Breast Cancer Dataset")
    plt.xlabel("k")
    plt.ylabel("Accuracy")
    plt.xticks(k_values)
    plt.grid(True)
    plt.legend()
    plt.show()
def plot_svm_accuracies(kernels, svm_accuracies):
    plt.figure(figsize=(8, 6))
    plt.bar(kernels, [svm_accuracies[k] for k in kernels], color='c')
    plt.title("SVM Accuracy for Breast Cancer Dataset")
    plt.xlabel("Kernel")
    plt.ylabel("Accuracy")
    plt.grid(True)
    plt.show()
def plot_rf_accuracies(estimators, rf_accuracies):
    plt.figure(figsize=(8, 6))
    plt.plot(estimators, [rf_accuracies[n] for n in estimators], marker='o',
             color='g', label="Accuracy")
    plt.title("Random Forest Accuracy for Breast Cancer Dataset")
    plt.xlabel("Number of Estimators")
    plt.ylabel("Accuracy")
    plt.xticks(estimators)
    plt.grid(True)
    plt.legend()
    plt.show()
def plot_regression_results(y_test, y_pred, title):
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, color='blue')
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)],
             color='red', linewidth=2, label='Perfect Fit')
    plt.title(f"{title}: Predicted vs Actual Values")
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.grid(True)
    plt.legend()
    plt.show()
def plot_box_plots(y_test, y_pred_lr, y_pred_dt):
    plt.figure(figsize=(10, 6))
    plt.boxplot([y_test, y_pred_lr, y_pred_dt],
                labels=['Actual', 'Linear Regression', 'Decision Tree'])
    plt.title("Box Plot of Actual vs Predicted Values")
    plt.ylabel("Values")
    plt.grid(True)
    plt.show()
plot_knn_accuracies(k_values, knn_accuracies_breast)
plot_svm_accuracies(kernels, svm_accuracies_breast)
plot_rf_accuracies(estimators, rf_accuracies_breast)
plot_regression_results(y_test_diabetes, y_pred_lr_diabetes, "Linear Regression (Diabetes)")
plot_regression_results(y_test_diabetes, y_pred_dt_diabetes, "Decision Tree Regression (Diabetes)")
plot_box_plots(y_test_diabetes, y_pred_lr_diabetes, y_pred_dt_diabetes)
Conclusion:
Classification Models Comparison (Breast Cancer)
1. k-NN:
o Accuracy: Achieved varying accuracy depending on k; optimal
performance was usually around k=5.
2. SVM:
o Accuracy: Performed well with different kernels, particularly
the RBF kernel, which is effective for complex decision
boundaries.
3. Random Forest:
o Accuracy: Generally the highest accuracy among the classifiers;
demonstrated robust performance.
o Precision: High precision, indicating fewer false positives.
o Recall: High recall, showing effectiveness in identifying
true positives.
o F1-Score: Balanced between precision and recall, reflecting good
overall performance.
Regression Models Comparison (Diabetes)
1. Linear Regression:
o MSE: Moderate; effectively captures linear relationships but may struggle with non-linear patterns.
o R² Score: Indicates decent explanatory power for the variability in the diabetes dataset.
2. Decision Tree Regression:
o MSE: Generally lower than Linear Regression, indicating better predictive performance.
o R² Score: Higher than Linear Regression; adept at capturing non-linear relationships in the data.
Summary
For Classification: Random Forest outperformed k-NN and SVM in accuracy and provided additional insights with precision, recall,
and F1-score metrics.
For Regression: The Decision Tree Regression model exhibited better performance compared to Linear Regression, as shown by
lower MSE and higher R², indicating its capability to model complex relationships effectively.