ML LAB Manual-1
PROGRAM:
#Compute median, mode, variance and standard deviation of a list
import statistics
# initializing list
data = [1, 2, 3, 3, 2, 2, 2, 1]
# Median
median = statistics.median(data)
print("Median:", median)
# Mode
mode = statistics.mode(data)
print("Mode:", mode)
# Variance
variance = statistics.variance(data)
print("Variance:", variance)
# Standard Deviation
std_deviation = statistics.stdev(data)
print("Standard Deviation:", std_deviation)
OUTPUT:
Median: 2.0
Mode: 2
Variance: 0.5714285714285714
Standard Deviation: 0.7559289460184545
PROGRAM:
#Apply the following Pre-processing techniques for a given dataset.
#Attribute selection
#Handling Missing Values
#Discretization
#Elimination of Outliers
A. Attribute selection
METHOD1: Display Attributes or Features from a DataFrame
import pandas as pd
# Creating a sample dataframe
df = pd.DataFrame({'Job Position': ['CEO', 'Senior Manager', 'Junior Manager',
                                    'Employee', 'Assistant Staff'],
                   'Years of Experience': [5, 4, 3, 2, 1],
                   'Salary': [100000, 80000, 90000, 40000, 20000]})
print(df.columns)
METHOD2: Display Features from a scikit-learn Dataset
# NOTE: only the y_data line of this method survives in the listing; the rest
# is a minimal reconstruction that prints the iris feature columns shown in
# the OUTPUT below.
from sklearn.datasets import load_iris
import pandas as pd
iris = load_iris()
x_data = pd.DataFrame(iris.data, columns=iris.feature_names)
y_data = load_iris().target
print(x_data.head())
METHOD 3:
import pandas as pd
# Load the dataset (replace 'your_data.csv' with the actual path)
data = pd.read_csv('your_data.csv')
# Display the feature names
print("Feature Names:")
print(data.columns)
OUTPUT:
Index(['Job Position', 'Years of Experience', 'Salary'], dtype='object')
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
B. Handling Missing Values
import pandas as pd
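# NOTE: the code for this section is missing from the listing; the following
# is a minimal sketch (the DataFrame contents and column names are assumptions,
# not the original lab data) showing the usual pandas approaches.
import numpy as np
df_missing = pd.DataFrame({'age': [25, np.nan, 28, 45],
                           'salary': [50000, 60000, np.nan, 80000]})
# Option 1: drop rows that contain any missing value
print(df_missing.dropna())
# Option 2: fill missing values with a column statistic, e.g. the mean
print(df_missing.fillna(df_missing.mean()))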
C. Discretization
# Sample data
data = {'age': [25, 30, 28, 45, 50, 35, 22, 60]}
df = pd.DataFrame(data)
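# NOTE: the binning step is missing from the listing; this pd.cut call is a
# minimal sketch that reproduces the OUTPUT below (the exact bin edges used
# in the original are an assumption).
df['age_category'] = pd.cut(df['age'], bins=[0, 29, 40, 100],
                            labels=['Young', 'Middle Aged', 'Old'])
print(df)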
OUTPUT:
age age_category
0 25 Young
1 30 Middle Aged
2 28 Young
3 45 Old
4 50 Old
5 35 Middle Aged
6 22 Young
7 60 Old
D. Elimination of Outliers
METHOD1: Boxplots
# Sample data
sample = [15, 105, 18, 7, 13, 16, 11, 21, 5, 15, 10, 9]
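# NOTE: the plotting code is missing from the listing; a minimal matplotlib
# sketch that shows the value 105 as a point beyond the whiskers.
import matplotlib.pyplot as plt
plt.boxplot(sample)
plt.title('Boxplot of sample data')
plt.ylabel('Value')
plt.show()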
METHOD2: Z-SCORE
import numpy as np

def detect_outliers_zscore(data):
    outliers = []  # define inside the function to avoid accumulating results from previous calls
    threshold = 3
    mean = np.mean(data)
    std = np.std(data)
    print("Mean:", mean)
    print("Standard Deviation:", std)
    for i in data:
        z_score = (i - mean) / std
        if np.abs(z_score) > threshold:
            outliers.append(i)
    return outliers

# Sample data (the same list as in the boxplot example)
sample = [15, 105, 18, 7, 13, 16, 11, 21, 5, 15, 10, 9]
print("Outliers from Z-score method:", detect_outliers_zscore(sample))
METHOD3: IQR
import numpy as np

def detect_outliers_iqr(data):
    outliers = []  # declare inside the function to avoid reusing results from previous calls
    data = sorted(data)
    # Compute quartiles and IQR
    q1 = np.percentile(data, 25)
    q3 = np.percentile(data, 75)
    IQR = q3 - q1
    print(f"Q1: {q1}, Q3: {q3}, IQR: {IQR}")
    # Define bounds
    lower_bound = q1 - (1.5 * IQR)
    upper_bound = q3 + (1.5 * IQR)
    print(f"Lower Bound: {lower_bound}, Upper Bound: {upper_bound}")
    # Find outliers
    for i in data:
        if i < lower_bound or i > upper_bound:
            outliers.append(i)
    return outliers

print("Outliers from IQR method:", detect_outliers_iqr(sample))
OUTPUT:
Mean: 20.416666666666668
Standard Deviation: 25.882614284925356
Outliers from Z-score method: [105]
Q1: 9.75, Q3: 16.5, IQR: 6.75
Lower Bound: -0.375, Upper Bound: 26.625
Outliers from IQR method: [105]
PROGRAM:
#Apply KNN algorithm for classification and regression
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score, mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
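# NOTE: the original listing jumps from the imports to the optimal-k selection;
# the following minimal sketch populates k_range, accuracies and rmse_values
# used below (the iris dataset, the 70/30 split and random_state are assumptions).
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42)
# For regression, predict petal width (the last feature) from the other three
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    iris.data[:, :3], iris.data[:, 3], test_size=0.3, random_state=42)
k_range = range(1, 26)
accuracies = []
rmse_values = []
for k in k_range:
    clf = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)
    accuracies.append(accuracy_score(y_test, clf.predict(X_test)))
    reg = KNeighborsRegressor(n_neighbors=k).fit(Xr_train, yr_train)
    rmse_values.append(np.sqrt(mean_squared_error(yr_test, reg.predict(Xr_test))))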
optimal_k_class = k_range[np.argmax(accuracies)]
print(f"Optimal k for classification: {optimal_k_class}")
optimal_k_reg = k_range[np.argmin(rmse_values)]
print(f"Optimal k for regression: {optimal_k_reg}")
# Plot the results
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(k_range, accuracies, marker='o')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Accuracy')
plt.title('KNN Classification Accuracy')
plt.subplot(1, 2, 2)
plt.plot(k_range, rmse_values, marker='o')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('RMSE')
plt.title('KNN Regression RMSE')
plt.tight_layout()
plt.show()
OUTPUT:
(prints the optimal k for classification and regression, and displays the two plots)
PROGRAM:
#Demonstrate decision tree algorithm for a classification problem and perform parameter tuning
#for better results
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
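# NOTE: the training and tuning steps are missing from the original listing;
# the following is a minimal sketch (the split, parameter grid and
# random_state values are assumptions).
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42)
# Tune key hyperparameters with a grid search
param_grid = {'max_depth': [2, 3, 4, 5, None], 'criterion': ['gini', 'entropy']}
grid = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5)
grid.fit(X_train, y_train)
print("Best parameters:", grid.best_params_)
clf = grid.best_estimator_
# Visualize the tuned tree
plt.figure(figsize=(10, 6))
plot_tree(clf, feature_names=iris.feature_names, class_names=iris.target_names, filled=True)
plt.show()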
# Make predictions
y_pred = clf.predict(X_test)
PROGRAM:
#Apply Random Forest algorithm for classification and regression
CLASSIFICATION:
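# NOTE: the original listing shows only the prediction step; the following
# minimal sketch trains the rf_classifier it relies on (the iris dataset,
# split and forest size are assumptions).
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.3, random_state=42)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)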
# Make predictions
y_pred = rf_classifier.predict(X_test)
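# Compute accuracy (this evaluation step is implied by the OUTPUT below)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")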
OUTPUT:
Accuracy: 100.00%
REGRESSION:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
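# NOTE: the data generation and training steps are missing from the listing;
# a minimal sketch using the imports above (the make_regression parameters
# and split are assumptions).
X, y = make_regression(n_samples=200, n_features=5, noise=10.0, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)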
# Make predictions
y_pred = rf_regressor.predict(X_test)
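# Evaluate with mean squared error (implied by the mean_squared_error import)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")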
OUTPUT:
# Evaluation utilities for a classification program; this fragment assumes a
# fitted classifier's predictions y_pred, the matching y_test, and a dataset
# `data` loaded with a scikit-learn loader that provides target_names.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Compute accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)
# Classification Report
print("\nClassification Report:\n", classification_report(
    y_test, y_pred, target_names=data.target_names
))
OUTPUT: