ML Manual
Lab 1:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv(r'housing.csv')
df.head()
df.shape
df.info()
df.nunique()
df.isnull().sum()
df.duplicated().sum()
df['total_bedrooms'].median()
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())
for i in df.iloc[:, 2:7]:
    df[i] = df[i].astype('int')
df.describe().T
Numerical = df.select_dtypes(include=[np.number]).columns
print(Numerical)
for col in Numerical:
    plt.figure(figsize=(10, 6))
    df[col].plot(kind='hist', title=col, bins=60, edgecolor='black')
    plt.ylabel('Frequency')
    plt.show()
for col in Numerical:
    plt.figure(figsize=(6, 6))
    sns.boxplot(df[col], color='blue')
    plt.title(col)
    plt.ylabel(col)
    plt.show()
Lab 2:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
df = pd.read_csv(r'housing.csv')
variable_meaning = {
    "MedInc": "Median income in block group",
    "HouseAge": "Median house age in block group",
    "AveRooms": "Average number of rooms per household",
    "AveBedrms": "Average number of bedrooms per household",
    "Population": "Population of block group",
    "AveOccup": "Average number of household members",
    "Latitude": "Latitude of block group",
    "Longitude": "Longitude of block group",
    "Target": "Median house value (in $100,000s)"
}
variable_df = pd.DataFrame(list(variable_meaning.items()), columns=["Feature", "Description"])
print("\nVariable Meaning Table:")
print(variable_df)
df.hist(figsize=(12, 8), bins=30, edgecolor='black')  # df.hist creates its own figure
plt.suptitle("Feature Distributions", fontsize=16)
plt.show()
plt.figure(figsize=(12, 6))
sns.boxplot(data=df.select_dtypes(include=[np.number]))
plt.xticks(rotation=45)
plt.title("Boxplots of Features to Identify Outliers")
plt.show()
correlation_matrix = df.select_dtypes(include=[np.number]).corr()
print(correlation_matrix)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', square=True)
plt.title("Correlation Heatmap of Features")
plt.show()
sns.pairplot(df.select_dtypes(include=[np.number]).sample(300), corner=True)
plt.suptitle("Pair Plot of Sampled Numerical Features", y=1.02)
plt.show()
Lab 3:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
iris = load_iris()
features = iris.data
target = iris.target
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)
pca = PCA(n_components=2)
features_pca = pca.fit_transform(features_standardized)
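The lab stops after computing the two components; a minimal visualization sketch (plotting choices such as colouring by species are assumptions, not part of the original manual):
import numpy as np
import matplotlib.pyplot as plt
print("Explained variance ratio:", pca.explained_variance_ratio_)
plt.figure(figsize=(8, 6))
for label in np.unique(target):
    mask = target == label
    plt.scatter(features_pca[mask, 0], features_pca[mask, 1], label=iris.target_names[label])
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of the Iris Dataset (2 Components)')
plt.legend()
plt.show()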
Lab 4:
import pandas as pd
data = pd.read_csv(r"training_data.csv")
def find_s_algorithm(data):
    attributes = data.iloc[:, :-1].values
    target = data.iloc[:, -1].values
    # initialize the hypothesis with the first positive example
    for i in range(len(target)):
        if target[i] == "Yes":
            hypothesis = attributes[i].copy()
            break
    # generalize: replace any attribute that disagrees with a positive example by '?'
    for i in range(len(target)):
        if target[i] == "Yes":
            for j in range(len(hypothesis)):
                if hypothesis[j] != attributes[i][j]:
                    hypothesis[j] = '?'
    return hypothesis
final_hypothesis = find_s_algorithm(data)
print("Most Specific Hypothesis:", final_hypothesis)
Lab 5:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
values = np.random.rand(100)
labels = []
for i in values[:50]:
    if i <= 0.5:
        labels.append('Class1')
    else:
        labels.append('Class2')
labels += [None] * 50
data = {
"Point": [f"x{i+1}" for i in range(100)],
"Value": values,
"Label": labels
}
print(data)
print(type(data))  # the raw data is a plain dict
df = pd.DataFrame(data)
df.nunique()
df.shape
df.describe().T
df.isnull().sum()
num_col = df.select_dtypes(include=['int', 'float']).columns
for col in num_col:
    df[col].hist(bins=10, alpha=0.5, edgecolor='black', grid=False)
    plt.title(f'Histogram for {col}')
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.show()
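KNeighborsClassifier and accuracy_score are imported above but never used; a minimal sketch of the implied task, classifying the 50 unlabeled points from the 50 labeled ones (the k values and the ground-truth rule Value <= 0.5 -> Class1 are assumptions taken from the labeling step above):
labeled = df[df['Label'].notna()]
unlabeled = df[df['Label'].isna()]
X_lab, y_lab = labeled[['Value']].values, labeled['Label'].values
X_unlab = unlabeled[['Value']].values
true_labels = np.where(unlabeled['Value'] <= 0.5, 'Class1', 'Class2')  # assumed ground truth
for k in [1, 3, 5, 20, 30]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_lab, y_lab)
    preds = knn.predict(X_unlab)
    print(f"k={k}: accuracy on the unlabeled half = {accuracy_score(true_labels, preds):.2f}")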
Lab 6:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
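df_lwr is referenced below but never created in the manual; as a stand-in, synthesize a noisy sine curve whose column names match the code that follows:
np.random.seed(42)
x_vals = np.linspace(0, 2 * np.pi, 100)
df_lwr = pd.DataFrame({'X': x_vals,
                       'Y': np.sin(x_vals) + 0.2 * np.random.randn(100)})  # assumed data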
X_train = df_lwr[['X']].values
y_train = df_lwr['Y'].values
X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
tau = 0.5
X_range = np.linspace(X_train[:, 1].min(), X_train[:, 1].max(), 100)
y_pred = []
for x in X_range:
    x_vec = np.array([1, x])  # query point with intercept term
    # Gaussian kernel weights centred on the query point
    weights = np.exp(-cdist([[x]], X_train[:, 1:], 'sqeuclidean') / (2 * tau**2)).flatten()
    W = np.diag(weights)
    # weighted least-squares solution for the local parameters
    theta = np.linalg.pinv(X_train.T @ W @ X_train) @ (X_train.T @ W @ y_train)
    y_pred.append(x_vec @ theta)
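The loop fills y_pred but the manual never plots the result; a short sketch of the usual visualization:
plt.figure(figsize=(8, 5))
plt.scatter(X_train[:, 1], y_train, color='blue', alpha=0.5, label='Training data')
plt.plot(X_range, y_pred, color='red', linewidth=2, label=f'LWR fit (tau={tau})')
plt.xlabel('X')
plt.ylabel('Y')
plt.title('Locally Weighted Regression')
plt.legend()
plt.show()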
Lab 7:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score
def linear_regression_california():
    housing = fetch_california_housing(as_frame=True)
    X = housing.data[["AveRooms"]]
    y = housing.target
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.plot(X_test, y_pred, color="red", label="Predicted")
    plt.xlabel("Average number of rooms (AveRooms)")
    plt.ylabel("Median value of homes ($100,000)")
    plt.title("Linear Regression - California Housing Dataset")
    plt.legend()
    plt.show()
    print("Linear Regression - California Housing Dataset")
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))
def polynomial_regression_auto_mpg():
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
    column_names = ["mpg", "cylinders", "displacement", "horsepower", "weight",
                    "acceleration", "model_year", "origin"]
    # the trailing quoted car-name field is tab-separated; treating the tab as a
    # comment marker drops it so the eight names above line up with the data
    data = pd.read_csv(url, sep=r'\s+', comment='\t', names=column_names, na_values="?")
    data = data.dropna()
    X = data["displacement"].values.reshape(-1, 1)
    y = data["mpg"].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    poly_model = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(), LinearRegression())
    poly_model.fit(X_train, y_train)
    y_pred = poly_model.predict(X_test)
    plt.scatter(X_test, y_test, color="blue", label="Actual")
    plt.scatter(X_test, y_pred, color="red", label="Predicted")
    plt.xlabel("Displacement")
    plt.ylabel("Miles per gallon (mpg)")
    plt.title("Polynomial Regression - Auto MPG Dataset")
    plt.legend()
    plt.show()
    print("Polynomial Regression - Auto MPG Dataset")
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R^2 Score:", r2_score(y_test, y_pred))
if __name__ == "__main__":
print("Demonstra ng Linear Regression and Polynomial Regression\n")
linear_regression_california()
polynomial_regression_auto_mpg()
Lab 8:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
data = load_breast_cancer()
X = data.data
y = data.target
# the manual jumps straight to plotting clf; the classifier must be trained first
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
plt.figure(figsize=(12, 8))
tree.plot_tree(clf, filled=True, feature_names=data.feature_names,
               class_names=data.target_names)
plt.title("Decision Tree - Breast Cancer Dataset")
plt.show()
Lab 9:
import numpy as np
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
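Only the imports survive for this lab; a minimal sketch of the workflow they imply, Gaussian Naive Bayes on the Olivetti faces (the 80/20 split and 5-fold cross-validation are assumptions):
faces = fetch_olivetti_faces(shuffle=True, random_state=42)
X, y = faces.data, faces.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))
scores = cross_val_score(GaussianNB(), X, y, cv=5)
print("5-fold CV accuracy:", scores.mean())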
Lab 10:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report
data = load_breast_cancer()
X = data.data
y = data.target
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
# the plots below reference kmeans and df, which the manual never defines; fit and build them here
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_scaled)
df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df['Cluster'] = clusters
df['True Label'] = y
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1', s=100, edgecolor='black', alpha=0.7)
plt.title('K-Means Clustering of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.show()
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='True Label', palette='coolwarm', s=100, edgecolor='black', alpha=0.7)
plt.title('True Labels of Breast Cancer Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="True Label")
plt.show()
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set1', s=100, edgecolor='black', alpha=0.7)
centers = pca.transform(kmeans.cluster_centers_)
plt.scatter(centers[:, 0], centers[:, 1], s=200, c='red', marker='X', label='Centroids')
plt.title('K-Means Clustering with Centroids')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title="Cluster")
plt.show()
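confusion_matrix and classification_report are imported for this lab but never used; a hedged evaluation sketch (K-Means cluster ids are arbitrary, so flip them when the flipped assignment agrees better with the true labels):
clusters = df['Cluster'].values
if (clusters == y).mean() < 0.5:
    clusters = 1 - clusters  # align arbitrary cluster ids with the true labels
print(confusion_matrix(y, clusters))
print(classification_report(y, clusters, target_names=data.target_names))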