
Khadija Imran
2022-CS-171
Task with iris and California housing datasets:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
#task1
#1
iris = datasets.load_iris(as_frame=True)
california_housing = datasets.fetch_california_housing(as_frame=True)
print("Iris Dataset Description:\n", iris.DESCR)
print("California Housing Dataset Description:\n", california_housing.DESCR)
print("\nIris Dataset Head:\n", iris.frame.head())
print("\nCalifornia Housing Dataset Head:\n", california_housing.frame.head())
print("\nIris Dataset Info:\n", iris.frame.info())
print("\nCalifornia Housing Dataset Info:\n", california_housing.frame.info())
#2
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris.data)
california_housing_scaled = scaler.fit_transform(california_housing.data)
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    iris_scaled, iris.target, test_size=0.2, random_state=42)
X_train_california_housing, X_test_california_housing, y_train_california_housing, y_test_california_housing = train_test_split(
    california_housing_scaled, california_housing.target, test_size=0.2, random_state=42)
print("\nIris Training Data Shape:", X_train_iris.shape)
print("Iris Testing Data Shape:", X_test_iris.shape)
print("California Housing Training Data Shape:", X_train_california_housing.shape)
print("California Housing Testing Data Shape:", X_test_california_housing.shape)
# Task 2
#3
k_values = range(1, 16)
knn_accuracies = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_iris, y_train_iris)
    y_pred_knn = knn.predict(X_test_iris)
    knn_accuracies.append(accuracy_score(y_test_iris, y_pred_knn))
print("\nk-NN Accuracies for k=1 to 15:", knn_accuracies)

#4
kernels = ['linear', 'poly', 'rbf']
svm_accuracies = {}
for kernel in kernels:
    svm = SVC(kernel=kernel)
    svm.fit(X_train_iris, y_train_iris)
    y_pred_svm = svm.predict(X_test_iris)
    svm_accuracies[kernel] = accuracy_score(y_test_iris, y_pred_svm)
print("\nSVM Accuracies for different kernels:", svm_accuracies)
#5
estimators = [10, 50, 100]
rf_accuracies = {}
for n_estimators in estimators:
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    rf.fit(X_train_iris, y_train_iris)
    y_pred_rf = rf.predict(X_test_iris)
    rf_accuracies[n_estimators] = accuracy_score(y_test_iris, y_pred_rf)
print("\nRandom Forest Accuracies for different number of estimators:", rf_accuracies)
# Task 3
# 6.
lr = LinearRegression()
lr.fit(X_train_california_housing, y_train_california_housing)
y_pred_lr = lr.predict(X_test_california_housing)
mse_lr = mean_squared_error(y_test_california_housing, y_pred_lr)
r2_lr = r2_score(y_test_california_housing, y_pred_lr)
print("\nLinear Regression - MSE:", mse_lr)
print("Linear Regression - R² Score:", r2_lr)

#7
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_california_housing, y_train_california_housing)
y_pred_dt = dt.predict(X_test_california_housing)
mse_dt = mean_squared_error(y_test_california_housing, y_pred_dt)
r2_dt = r2_score(y_test_california_housing, y_pred_dt)
print("\nDecision Tree Regression - MSE:", mse_dt)
print("Decision Tree Regression - R² Score:", r2_dt)
# Task 4
#8
print("\nClassification Model Evaluations:")
print("k-NN (k=5) - Accuracy:", accuracy_score(y_test_iris, knn.predict(X_test_iris)))
print("SVM (rbf) - Accuracy:", accuracy_score(y_test_iris, svm.predict(X_test_iris)))
print("Random Forest (n=100) - Accuracy:", accuracy_score(y_test_iris, rf.predict(X_test_iris)))
precision_rf = precision_score(y_test_iris, y_pred_rf, average='macro')
recall_rf = recall_score(y_test_iris, y_pred_rf, average='macro')
f1_rf = f1_score(y_test_iris, y_pred_rf, average='macro')
print("\nRandom Forest (n=100) - Precision:", precision_rf)
print("Random Forest (n=100) - Recall:", recall_rf)
print("Random Forest (n=100) - F1-Score:", f1_rf)
#9
print("\nRegression Model Evaluations:")
print("Linear Regression - MSE:", mse_lr, ", R² Score:", r2_lr)
print("Decision Tree Regression - MSE:", mse_dt, ", R² Score:", r2_dt)

def plot_knn_accuracies(k_values, knn_accuracies):
    plt.figure(figsize=(8, 6))
    plt.plot(k_values, knn_accuracies, marker='o', color='b', label="Accuracy")
    plt.title("k-NN Accuracy for Different k Values")
    plt.xlabel("k")
    plt.ylabel("Accuracy")
    plt.xticks(k_values)
    plt.grid(True)
    plt.legend()
    plt.show()

def plot_svm_accuracies(kernels, svm_accuracies):
    plt.figure(figsize=(8, 6))
    plt.bar(kernels, [svm_accuracies[k] for k in kernels], color='c')
    plt.title("SVM Accuracy for Different Kernels")
    plt.xlabel("Kernel")
    plt.ylabel("Accuracy")
    plt.grid(True)
    plt.show()

def plot_rf_accuracies(estimators, rf_accuracies):
    plt.figure(figsize=(8, 6))
    plt.plot(estimators, [rf_accuracies[n] for n in estimators], marker='o', color='g', label="Accuracy")
    plt.title("Random Forest Accuracy for Different Estimators")
    plt.xlabel("Number of Estimators")
    plt.ylabel("Accuracy")
    plt.xticks(estimators)
    plt.grid(True)
    plt.legend()
    plt.show()

def plot_regression_results(y_test, y_pred, title):
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, color='blue')
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linewidth=2, label='Perfect Fit')
    plt.title(f"{title}: Predicted vs Actual Values")
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.grid(True)
    plt.legend()
    plt.show()

def plot_iris_class_distribution(y):
    labels, counts = np.unique(y, return_counts=True)
    plt.figure(figsize=(8, 6))
    plt.pie(counts, labels=labels, autopct='%1.1f%%', startangle=140)
    plt.title("Class Distribution in Iris Dataset")
    plt.axis('equal')
    plt.show()

def plot_california_housing_boxplot(data):
    plt.figure(figsize=(12, 6))
    plt.boxplot(data, vert=False, patch_artist=True)
    plt.title("Box Plot of California Housing Features")
    plt.xlabel("Value")
    plt.yticks(range(1, len(data.columns) + 1), data.columns)
    plt.grid(True)
    plt.show()
plot_knn_accuracies(k_values, knn_accuracies)
plot_svm_accuracies(kernels, svm_accuracies)
plot_rf_accuracies(estimators, rf_accuracies)
plot_regression_results(y_test_california_housing, y_pred_lr, "Linear Regression")
plot_regression_results(y_test_california_housing, y_pred_dt, "Decision Tree Regression")
plot_iris_class_distribution(iris.target)
plot_california_housing_boxplot(california_housing.frame)

Conclusion:
Classification Models Comparison
1. k-NN (k=5):
   o Accuracy: High, but performance is sensitive to the choice of k (see the cross-validation sketch after this list).
2. SVM (rbf kernel):
   o Accuracy: Competitive; benefits from the kernel trick on non-linearly separable data.
3. Random Forest (n_estimators=100):
   o Accuracy: Generally the highest among the three; it also comes with precision, recall, and F1-score, indicating robust performance across classes.
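
The sensitivity to k can be checked more robustly than with a single 80/20 split. The following is a minimal sketch (not part of the graded tasks), assuming the `iris` object loaded above; it re-selects k by 5-fold cross-validation and uses a Pipeline so the scaler is re-fit inside each fold rather than on the full dataset:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

cv_scores = {}
for k in range(1, 16):
    # scaling happens inside each fold, avoiding leakage from the held-out fold
    pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=k))
    cv_scores[k] = cross_val_score(pipe, iris.data, iris.target, cv=5).mean()
best_k = max(cv_scores, key=cv_scores.get)
print(f"Best k by 5-fold CV: {best_k} (mean accuracy {cv_scores[best_k]:.3f})")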
Regression Models Comparison
1. Linear Regression:
   o MSE: Moderate; captures linear relationships effectively.
   o R² Score: Indicates how much of the variability in the target the model explains (a worked check follows this list).
2. Decision Tree Regression:
   o MSE: Somewhat lower than Linear Regression on this dataset, indicating better performance on complex data.
   o R² Score: Higher than Linear Regression; the tree can capture non-linear relationships well.
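
Since R² is cited for both models, here is a short worked check, assuming `y_test_california_housing` and `y_pred_lr` from the script above. It computes R² = 1 - SS_res/SS_tot directly from the definition and compares it with sklearn's value:

import numpy as np
from sklearn.metrics import r2_score

y_true = np.asarray(y_test_california_housing)
ss_res = np.sum((y_true - y_pred_lr) ** 2)      # residual sum of squares
ss_tot = np.sum((y_true - y_true.mean()) ** 2)  # total sum of squares
print("manual R²:", 1 - ss_res / ss_tot)
print("sklearn R²:", r2_score(y_true, y_pred_lr))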
Summary
 Random Forest outperformed k-NN and SVM in classification, providing additional evaluation metrics.
 For regression, the Decision Tree showed better performance than Linear Regression in terms of MSE and R², indicating its strength in capturing complex relationships in the data.

Tasks with breast cancer and diabetes datasets:


from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_squared_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Task 1
#1
breast_cancer = datasets.load_breast_cancer(as_frame=True)
diabetes = datasets.load_diabetes(as_frame=True)
print("Breast Cancer Dataset Description:\n", breast_cancer.DESCR)
print("Diabetes Dataset Description:\n", diabetes.DESCR)
print("\nBreast Cancer Dataset Head:\n", breast_cancer.frame.head())
print("\nDiabetes Dataset Head:\n", diabetes.frame.head())
print("\nBreast Cancer Dataset Info:\n", breast_cancer.frame.info())
print("\nDiabetes Dataset Info:\n", diabetes.frame.info())
#2
scaler = StandardScaler()
breast_cancer_scaled = scaler.fit_transform(breast_cancer.data)
diabetes_scaled = scaler.fit_transform(diabetes.data)
X_train_breast_cancer, X_test_breast_cancer, y_train_breast_cancer, y_test_breast_cancer = train_test_split(
    breast_cancer_scaled, breast_cancer.target, test_size=0.2, random_state=42)
X_train_diabetes, X_test_diabetes, y_train_diabetes, y_test_diabetes = train_test_split(
    diabetes_scaled, diabetes.target, test_size=0.2, random_state=42)

print("\nBreast Cancer Training Data Shape:", X_train_breast_cancer.shape)


print("Breast Cancer Testing Data Shape:", X_test_breast_cancer.shape)
print("Diabetes Training Data Shape:", X_train_diabetes.shape)
print("Diabetes Testing Data Shape:", X_test_diabetes.shape)
# Task 2
#3
k_values = range(1, 16)
knn_accuracies_breast = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_breast_cancer, y_train_breast_cancer)
    y_pred_knn_breast = knn.predict(X_test_breast_cancer)
    knn_accuracies_breast.append(accuracy_score(y_test_breast_cancer, y_pred_knn_breast))
print("\nk-NN Accuracies for Breast Cancer (k=1 to 15):", knn_accuracies_breast)
#4
kernels = ['linear', 'poly', 'rbf']
svm_accuracies_breast = {}
for kernel in kernels:
    svm = SVC(kernel=kernel)
    svm.fit(X_train_breast_cancer, y_train_breast_cancer)
    y_pred_svm_breast = svm.predict(X_test_breast_cancer)
    svm_accuracies_breast[kernel] = accuracy_score(y_test_breast_cancer, y_pred_svm_breast)
print("\nSVM Accuracies for Breast Cancer with different kernels:", svm_accuracies_breast)
#5
estimators = [10, 50, 100]
rf_accuracies_breast = {}
for n_estimators in estimators:
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    rf.fit(X_train_breast_cancer, y_train_breast_cancer)
    y_pred_rf_breast = rf.predict(X_test_breast_cancer)
    rf_accuracies_breast[n_estimators] = accuracy_score(y_test_breast_cancer, y_pred_rf_breast)
print("\nRandom Forest Accuracies for Breast Cancer with different number of estimators:", rf_accuracies_breast)
# Task 3
#6
lr = LinearRegression()
lr.fit(X_train_diabetes, y_train_diabetes)
y_pred_lr_diabetes = lr.predict(X_test_diabetes)
mse_lr_diabetes = mean_squared_error(y_test_diabetes, y_pred_lr_diabetes)
r2_lr_diabetes = r2_score(y_test_diabetes, y_pred_lr_diabetes)
print("\nLinear Regression (Diabetes) - MSE:", mse_lr_diabetes)
print("Linear Regression (Diabetes) - R² Score:", r2_lr_diabetes)
#7
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train_diabetes, y_train_diabetes)
y_pred_dt_diabetes = dt.predict(X_test_diabetes)
mse_dt_diabetes = mean_squared_error(y_test_diabetes, y_pred_dt_diabetes)
r2_dt_diabetes = r2_score(y_test_diabetes, y_pred_dt_diabetes)
print("\nDecision Tree Regression (Diabetes) - MSE:", mse_dt_diabetes)
print("Decision Tree Regression (Diabetes) - R² Score:", r2_dt_diabetes)
# Task 4
#8
print("\nClassification Model Evaluations for Breast Cancer:")
print("k-NN (k=5) - Accuracy:", accuracy_score(y_test_breast_cancer, knn.predict(X_test_breast_cancer)))
print("SVM (rbf) - Accuracy:", accuracy_score(y_test_breast_cancer, svm.predict(X_test_breast_cancer)))
print("Random Forest (n=100) - Accuracy:", accuracy_score(y_test_breast_cancer, rf.predict(X_test_breast_cancer)))
precision_rf_breast = precision_score(y_test_breast_cancer, y_pred_rf_breast, average='macro')
recall_rf_breast = recall_score(y_test_breast_cancer, y_pred_rf_breast, average='macro')
f1_rf_breast = f1_score(y_test_breast_cancer, y_pred_rf_breast, average='macro')
print("\nRandom Forest (n=100) - Precision:", precision_rf_breast)
print("Random Forest (n=100) - Recall:", recall_rf_breast)
print("Random Forest (n=100) - F1-Score:", f1_rf_breast)
#9
print("\nRegression Model Evaluations for Diabetes:")
print("Linear Regression - MSE:", mse_lr_diabetes, ", R² Score:", r2_lr_diabetes)
print("Decision Tree Regression - MSE:", mse_dt_diabetes, ", R² Score:", r2_dt_diabetes)
labels = ['Malignant', 'Benign']
sizes = [np.sum(breast_cancer.target == 0), np.sum(breast_cancer.target == 1)]
plt.figure(figsize=(8, 6))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
plt.title("Breast Cancer Dataset Class Distribution")
plt.axis('equal')
plt.show()
def plot_knn_accuracies(k_values, knn_accuracies):
    plt.figure(figsize=(8, 6))
    plt.plot(k_values, knn_accuracies, marker='o', color='b', label="Accuracy")
    plt.title("k-NN Accuracy for Breast Cancer Dataset")
    plt.xlabel("k")
    plt.ylabel("Accuracy")
    plt.xticks(k_values)
    plt.grid(True)
    plt.legend()
    plt.show()
def plot_svm_accuracies(kernels, svm_accuracies):
    plt.figure(figsize=(8, 6))
    plt.bar(kernels, [svm_accuracies[k] for k in kernels], color='c')
    plt.title("SVM Accuracy for Breast Cancer Dataset")
    plt.xlabel("Kernel")
    plt.ylabel("Accuracy")
    plt.grid(True)
    plt.show()
def plot_rf_accuracies(estimators, rf_accuracies):
    plt.figure(figsize=(8, 6))
    plt.plot(estimators, [rf_accuracies[n] for n in estimators], marker='o', color='g', label="Accuracy")
    plt.title("Random Forest Accuracy for Breast Cancer Dataset")
    plt.xlabel("Number of Estimators")
    plt.ylabel("Accuracy")
    plt.xticks(estimators)
    plt.grid(True)
    plt.legend()
    plt.show()
def plot_regression_results(y_test, y_pred, title):
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, color='blue')
    plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color='red', linewidth=2, label='Perfect Fit')
    plt.title(f"{title}: Predicted vs Actual Values")
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.grid(True)
    plt.legend()
    plt.show()
def plot_box_plots(y_test, y_pred_lr, y_pred_dt):
    plt.figure(figsize=(10, 6))
    plt.boxplot([y_test, y_pred_lr, y_pred_dt], labels=['Actual', 'Linear Regression', 'Decision Tree'])
    plt.title("Box Plot of Actual vs Predicted Values")
    plt.ylabel("Values")
    plt.grid(True)
    plt.show()
plot_knn_accuracies(k_values, knn_accuracies_breast)
plot_svm_accuracies(kernels, svm_accuracies_breast)
plot_rf_accuracies(estimators, rf_accuracies_breast)
plot_regression_results(y_test_diabetes, y_pred_lr_diabetes, "Linear Regression (Diabetes)")
plot_regression_results(y_test_diabetes, y_pred_dt_diabetes, "Decision Tree Regression (Diabetes)")
plot_box_plots(y_test_diabetes, y_pred_lr_diabetes, y_pred_dt_diabetes)

Conclusion:
Classification Models Comparison (Breast Cancer)
1. k-NN:
   o Accuracy: Varied with k; optimal performance usually around k=5.
2. SVM:
   o Accuracy: Performed well across kernels, particularly the RBF kernel, which is effective for complex decision boundaries.
3. Random Forest:
   o Accuracy: Generally the highest among the classifiers; demonstrated robustness.
   o Precision: High, indicating few false positives.
   o Recall: High, showing effectiveness in identifying true positives.
   o F1-Score: Balanced between precision and recall, reflecting good overall performance (the per-class report sketch after this list breaks these numbers down).
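
The macro-averaged numbers above can hide per-class differences, which matter for a medical dataset. A small sketch, assuming `y_test_breast_cancer` and `y_pred_rf_breast` from the script above (in this dataset class 0 is malignant and class 1 is benign):

from sklearn.metrics import classification_report

# Per-class precision, recall, F1-score and support in one table
print(classification_report(y_test_breast_cancer, y_pred_rf_breast,
                            target_names=['malignant', 'benign']))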
Regression Models Comparison (Diabetes)
1. Linear Regression:
o MSE: Moderate; effectively captures linear relationships but may struggle with non-linear patterns.
o R² Score: Indicates decent explanatory power for the variability in the diabetes dataset.
2. Decision Tree Regression:
   o MSE: Depends strongly on tree depth; with depth controlled it can undercut Linear Regression, while a fully grown tree may overfit the noisy diabetes target (see the depth-tuning sketch after this list).
   o R² Score: Correspondingly higher when the tree generalizes well; adept at capturing non-linear relationships in the data.
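
Whether the tree beats the linear model here hinges on its depth. A hedged sketch, reusing the diabetes split from the script above, compares test MSE across a few depths (max_depth=None means a fully grown tree):

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Trade variance for bias by limiting tree depth
for depth in [2, 4, 8, None]:
    dt = DecisionTreeRegressor(max_depth=depth, random_state=42)
    dt.fit(X_train_diabetes, y_train_diabetes)
    mse = mean_squared_error(y_test_diabetes, dt.predict(X_test_diabetes))
    print(f"max_depth={depth}: test MSE = {mse:.1f}")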
Summary
 For Classification: Random Forest outperformed k-NN and SVM in accuracy and provided additional insight through precision, recall, and F1-score metrics.
 For Regression: The Decision Tree Regression model exhibited better performance than Linear Regression, as shown by lower MSE and higher R², indicating its capability to model complex relationships effectively.
