ML Lab P-1

The document contains multiple implementations of machine learning algorithms, including the Find-S algorithm, Candidate-Elimination algorithm, ID3 decision tree, artificial neural networks, Naïve Bayes classifier, and clustering techniques like EM and K-Means. Each section provides code examples for loading datasets, training models, and evaluating their performance. Additionally, it includes a Java implementation for a Naïve Bayes text classifier and a Bayesian network for heart disease prediction.

1. FIND-S algorithm
......................................................................................

# Implement and demonstrate the FIND-S algorithm
import pandas as pd

def load_data(filename):
    """Load dataset from a CSV file."""
    return pd.read_csv(filename)

def find_s_algorithm(data):
    """Implements the Find-S algorithm for learning the most specific hypothesis."""
    # Extract attributes and target column
    attributes = data.iloc[:, :-1].values   # All columns except the last
    target = data.iloc[:, -1].values        # Last column (Yes/No)

    # Initialize the most specific hypothesis (ϕ, ϕ, ϕ, ...)
    hypothesis = ["ϕ"] * len(attributes[0])

    # Find the first positive example to initialize the hypothesis
    for i in range(len(target)):
        if target[i].lower() == "yes":
            hypothesis = attributes[i].copy()
            break

    # Iterate over all examples to refine the hypothesis
    for i in range(len(target)):
        if target[i].lower() == "yes":
            for j in range(len(hypothesis)):
                if hypothesis[j] != attributes[i][j]:
                    hypothesis[j] = "?"

    return hypothesis

if __name__ == "__main__":
    # Load dataset
    filename = r"C:\Users\rahul\OneDrive\Desktop\mldata\training_data.csv"  # Change filename as needed
    data = load_data(filename)

    # Run the Find-S algorithm
    specific_hypothesis = find_s_algorithm(data)

    # Print the result
    print("Most Specific Hypothesis:", specific_hypothesis)
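The script expects a small categorical CSV whose last column is the Yes/No target. A hypothetical training_data.csv in the classic EnjoySport format (an assumed example, not part of the original listing) could look like this:

Sky,AirTemp,Humidity,Wind,Water,Forecast,EnjoySport
Sunny,Warm,Normal,Strong,Warm,Same,Yes
Sunny,Warm,High,Strong,Warm,Same,Yes
Rainy,Cold,High,Strong,Warm,Change,No
Sunny,Warm,High,Strong,Cool,Change,Yes

With this data the program would report the hypothesis Sunny, Warm, ?, Strong, ?, ? — the Humidity, Water and Forecast attributes get generalized to '?'.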

2. Candidate-Elimination algorithm
....................................................................................
import pandas as pd
import numpy as np

def load_data(filename):
    """Load training data from a CSV file."""
    return pd.read_csv(filename)

def candidate_elimination(data):
    """Implements the Candidate-Elimination algorithm."""
    attributes = data.columns[:-1]   # Attribute columns (target excluded)
    n_attrs = len(attributes)

    # Initialize G to the most general hypothesis and S to the most specific one
    G = [['?' for _ in range(n_attrs)]]   # General boundary
    S = ['ϕ' for _ in range(n_attrs)]     # Specific boundary

    for _, row in data.iterrows():
        instance = row.iloc[:-1].values   # Feature values
        label = row.iloc[-1]              # Class label

        if label == 'Yes':  # Positive example
            # Remove from G any hypothesis inconsistent with the positive example
            G = [g for g in G if is_consistent(instance, g)]

            # Generalize S where necessary
            if all(s == 'ϕ' for s in S):        # Initialize S with the first positive example
                S = list(instance)
            else:
                for j in range(n_attrs):
                    if S[j] != instance[j]:
                        S[j] = '?'              # Generalize S

        elif label == 'No':  # Negative example
            # S is not generalized on negative examples; with consistent training
            # data it never covers a negative example, so it is left unchanged.

            # Specialize every hypothesis in G that still covers the negative example
            new_G = []
            for g in G:
                if is_consistent(instance, g):
                    for j in range(n_attrs):
                        if g[j] == '?':
                            for val in np.unique(data.iloc[:, j]):
                                # Keep only minimal specializations that still cover S
                                if val != instance[j] and (S[j] == '?' or S[j] == 'ϕ' or S[j] == val):
                                    new_hypothesis = g.copy()
                                    new_hypothesis[j] = val
                                    new_G.append(new_hypothesis)
                else:
                    new_G.append(g)
            G = new_G

    return S, G

def is_consistent(instance, hypothesis):
    """Checks whether a hypothesis covers (is consistent with) an instance."""
    for i in range(len(instance)):
        if hypothesis[i] != '?' and hypothesis[i] != instance[i]:
            return False
    return True

if __name__ == "__main__":
    filename = "training_data.csv"   # Replace with the actual CSV filename
    data = load_data(filename)
    S_final, G_final = candidate_elimination(data)
    print("Final Specific Hypothesis:", S_final)
    print("Final General Hypotheses:", G_final)
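For intuition, is_consistent treats '?' as matching any attribute value. The quick check below uses made-up attribute values purely for illustration:

# A hypothesis that is '?' everywhere except the first position covers any instance whose first attribute matches
print(is_consistent(['Sunny', 'Warm', 'Normal', 'Strong'], ['Sunny', '?', '?', '?']))   # True
print(is_consistent(['Rainy', 'Cold', 'High', 'Strong'], ['Sunny', '?', '?', '?']))     # False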

3. ID3 decision tree
....................................................................................

# Write a program to demonstrate the working of the decision tree based ID3 algorithm.
# Use an appropriate data set for building the decision tree and apply this knowledge
# to classify a new sample.

import numpy as np
import pandas as pd
from collections import Counter

# Function to calculate entropy
def entropy(data):
    labels = data.iloc[:, -1]
    label_counts = Counter(labels)
    total = len(labels)
    return -sum((count / total) * np.log2(count / total) for count in label_counts.values())

# Function to calculate information gain
def info_gain(data, split_attribute):
    total_entropy = entropy(data)
    values, counts = np.unique(data[split_attribute], return_counts=True)
    weighted_entropy = sum(
        (counts[i] / sum(counts)) * entropy(data[data[split_attribute] == values[i]])
        for i in range(len(values))
    )
    return total_entropy - weighted_entropy

# Function to build the ID3 decision tree
def id3(data, attributes):
    labels = data.iloc[:, -1]
    if len(set(labels)) == 1:
        return labels.iloc[0]
    if len(attributes) == 0:
        return labels.mode()[0]

    best_attr = max(attributes, key=lambda attr: info_gain(data, attr))
    tree = {best_attr: {}}

    for value in np.unique(data[best_attr]):
        subset = data[data[best_attr] == value].drop(columns=[best_attr])
        tree[best_attr][value] = id3(subset, [attr for attr in attributes if attr != best_attr])

    return tree

# Function to classify a new sample
def classify(tree, sample):
    if not isinstance(tree, dict):
        return tree
    attribute = next(iter(tree))
    if sample[attribute] in tree[attribute]:
        return classify(tree[attribute][sample[attribute]], sample)
    else:
        return "Unknown"

# Sample dataset (PlayTennis)
data = pd.DataFrame({
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast',
                'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': ['Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
                    'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild'],
    'Humidity': ['High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
                 'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High'],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong',
             'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'PlayTennis': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
                   'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
})

# Build the decision tree
tree = id3(data, ['Outlook', 'Temperature', 'Humidity', 'Wind'])
print("Decision Tree:")
print(tree)

# Classify a new sample
new_sample = {'Outlook': 'Sunny', 'Temperature': 'Cool', 'Humidity': 'High', 'Wind': 'Strong'}
result = classify(tree, new_sample)
print("Classification Result:", result)
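As a quick sanity check (an added snippet, not part of the original program): with 9 Yes and 5 No examples, the root entropy of this PlayTennis table should be about 0.940 bits, and Outlook should have the largest information gain among the four attributes:

# Root entropy: -(9/14)*log2(9/14) - (5/14)*log2(5/14) ≈ 0.940
print("Root entropy:", round(entropy(data), 3))
for attr in ['Outlook', 'Temperature', 'Humidity', 'Wind']:
    print(attr, "gain:", round(info_gain(data, attr), 3))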

4. Artificial neural network with backpropagation (XOR problem)
.........................................................................................................
import numpy as np

# Activation function (Sigmoid) and its derivative
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return x * (1 - x)

# Training data (XOR problem)
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([[0], [1], [1], [0]])

# Initialize weights and biases
input_layer_neurons = 2
hidden_layer_neurons = 2
output_layer_neurons = 1

np.random.seed(42)
weights_input_hidden = np.random.uniform(size=(input_layer_neurons, hidden_layer_neurons))
weights_hidden_output = np.random.uniform(size=(hidden_layer_neurons, output_layer_neurons))
bias_hidden = np.random.uniform(size=(1, hidden_layer_neurons))
bias_output = np.random.uniform(size=(1, output_layer_neurons))

# Training the neural network
learning_rate = 0.5
epochs = 10000

for epoch in range(epochs):
    # Forward pass
    hidden_layer_activation = np.dot(X, weights_input_hidden) + bias_hidden
    hidden_layer_output = sigmoid(hidden_layer_activation)
    output_layer_activation = np.dot(hidden_layer_output, weights_hidden_output) + bias_output
    predicted_output = sigmoid(output_layer_activation)

    # Backpropagation
    error = y - predicted_output
    d_predicted_output = error * sigmoid_derivative(predicted_output)

    error_hidden_layer = d_predicted_output.dot(weights_hidden_output.T)
    d_hidden_layer = error_hidden_layer * sigmoid_derivative(hidden_layer_output)

    # Update weights and biases
    weights_hidden_output += hidden_layer_output.T.dot(d_predicted_output) * learning_rate
    bias_output += np.sum(d_predicted_output, axis=0, keepdims=True) * learning_rate
    weights_input_hidden += X.T.dot(d_hidden_layer) * learning_rate
    bias_hidden += np.sum(d_hidden_layer, axis=0, keepdims=True) * learning_rate

# Testing the trained ANN
def predict(sample):
    hidden_layer_activation = np.dot(sample, weights_input_hidden) + bias_hidden
    hidden_layer_output = sigmoid(hidden_layer_activation)
    output_layer_activation = np.dot(hidden_layer_output, weights_hidden_output) + bias_output
    return sigmoid(output_layer_activation)

# Test samples
test_samples = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
predictions = predict(test_samples)
print("Predictions:")
print(predictions)
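The printed predictions are raw sigmoid outputs between 0 and 1. A common follow-up step (an assumption, not shown in the original listing) is to threshold them at 0.5 to recover hard XOR labels:

# Convert probabilities to 0/1 class labels; if training converged this gives [0 1 1 0]
binary_predictions = (predictions > 0.5).astype(int)
print("Thresholded predictions:", binary_predictions.ravel())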

5. Naïve Bayes classifier (scikit-learn)
........................................................................................................

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load dataset from CSV file
def load_data(filename):
    return pd.read_csv(filename)

# Train Naïve Bayes classifier
def train_naive_bayes(data):
    X = data.iloc[:, :-1]   # Features
    y = data.iloc[:, -1]    # Labels
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = GaussianNB()
    model.fit(X_train, y_train)
    return model, X_train, X_test, y_train, y_test

# Compute accuracy of the classifier
def compute_accuracy(model, X_test, y_test):
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

# Main function
def main():
    filename = 'dataset.csv'   # Change this to your actual dataset file
    data = load_data(filename)
    model, X_train, X_test, y_train, y_test = train_naive_bayes(data)
    accuracy = compute_accuracy(model, X_test, y_test)
    print(f'Naïve Bayes Classifier Accuracy: {accuracy * 100:.2f}%')

    # Test with a few test data sets
    sample_tests = X_test[:5]
    predictions = model.predict(sample_tests)
    print("Sample Test Predictions:")
    for i, pred in enumerate(predictions):
        print(f'Test {i+1}: Predicted - {pred}, Actual - {y_test.iloc[i]}')

if __name__ == "__main__":
    main()
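GaussianNB assumes numeric feature columns, so dataset.csv should contain numbers plus a class label in the last column. If no such file is at hand, one quick way to try the pipeline (an assumed substitute, not part of the original lab sheet) is to export scikit-learn's built-in Iris data first:

from sklearn.datasets import load_iris
iris = load_iris(as_frame=True)
iris.frame.to_csv('dataset.csv', index=False)   # 'target' ends up as the last column and serves as the label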

6. Naïve Bayes text classifier in Java (Weka)
.................................................................................

import weka.classifiers.Evaluation;
import weka.classifiers.bayes.NaiveBayes;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class NaiveBayesTextClassifier {

    public static void main(String[] args) {
        try {
            // Load dataset from ARFF file (convert CSV to ARFF if needed)
            DataSource source = new DataSource("dataset.arff");
            Instances dataset = source.getDataSet();

            // Set class attribute (last column as label)
            dataset.setClassIndex(dataset.numAttributes() - 1);

            // Split dataset into training and testing sets (80% train, 20% test)
            int trainSize = (int) Math.round(dataset.numInstances() * 0.8);
            int testSize = dataset.numInstances() - trainSize;
            Instances trainSet = new Instances(dataset, 0, trainSize);
            Instances testSet = new Instances(dataset, trainSize, testSize);

            // Train Naïve Bayes model
            NaiveBayes model = new NaiveBayes();
            model.buildClassifier(trainSet);

            // Evaluate the model
            Evaluation eval = new Evaluation(trainSet);
            eval.evaluateModel(model, testSet);

            // Print evaluation metrics
            System.out.println("Accuracy: " + (1 - eval.errorRate()) * 100 + "%");
            System.out.println("Precision: " + eval.precision(1));
            System.out.println("Recall: " + eval.recall(1));
            System.out.println("F1 Score: " + eval.fMeasure(1));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
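Weka's NaiveBayes works directly on nominal or numeric attributes; free-text string attributes would first need to be converted, for example with Weka's StringToWordVector filter. A minimal hypothetical dataset.arff with already-nominal attributes (an illustrative sketch, not a file provided by the original document) could look like:

@relation messages
@attribute contains_offer {yes,no}
@attribute contains_free {yes,no}
@attribute class {spam,ham}
@data
yes,yes,spam
no,no,ham
yes,no,spam
no,yes,ham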

7. Bayesian network for heart disease prediction (pgmpy)
......................................................................................

import pandas as pd
from pgmpy.models import BayesianModel
from pgmpy.estimators import MaximumLikelihoodEstimator
from pgmpy.inference import VariableElimination

# Load Heart Disease dataset
data = pd.read_csv("heart_disease.csv")

# Define the Bayesian Network structure
model = BayesianModel([
    ('Age', 'HeartDisease'), ('Sex', 'HeartDisease'),
    ('ChestPain', 'HeartDisease'), ('Cholesterol', 'HeartDisease'),
    ('BloodPressure', 'HeartDisease')
])

# Learn CPDs (Conditional Probability Distributions) using Maximum Likelihood Estimation
model.fit(data, estimator=MaximumLikelihoodEstimator)

# Perform inference
inference = VariableElimination(model)

# Example query: Probability of Heart Disease given some conditions
query_result = inference.query(variables=['HeartDisease'], evidence={'Age': 55, 'Cholesterol': 230})
print(query_result)
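Two practical notes (assumptions about the environment, not stated in the original listing): recent pgmpy releases expose this class as BayesianNetwork rather than BayesianModel, and every evidence value passed to query() must be a state that actually appears in the training data. If heart_disease.csv stores Age and Cholesterol as continuous numbers, a hypothetical discretization step before model.fit could look like:

# Assumed preprocessing: bin continuous columns so evidence values are known discrete states
data['Age'] = pd.cut(data['Age'], bins=[0, 40, 55, 70, 120],
                     labels=['young', 'middle', 'senior', 'elderly'])
data['Cholesterol'] = pd.cut(data['Cholesterol'], bins=[0, 200, 240, 600],
                             labels=['normal', 'borderline', 'high'])
# ...and the query would then use bin labels, e.g. evidence={'Age': 'middle', 'Cholesterol': 'borderline'}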

8. Clustering with EM (Gaussian Mixture) and K-Means
............................................................................................
import pandas as pd
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Load dataset from CSV file
def load_data(filename):
    return pd.read_csv(filename)

# Apply EM algorithm (Gaussian Mixture Model) for clustering
def em_clustering(data, n_clusters):
    gmm = GaussianMixture(n_components=n_clusters, random_state=42)
    labels = gmm.fit_predict(data)
    return labels

# Apply k-Means clustering
def kmeans_clustering(data, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    labels = kmeans.fit_predict(data)
    return labels

# Main function
def main():
    filename = 'dataset.csv'   # Change this to your dataset file
    data = load_data(filename)

    # Preprocess data (standardization)
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data)

    n_clusters = 3   # Change the number of clusters as needed

    # Apply EM clustering
    em_labels = em_clustering(data_scaled, n_clusters)

    # Apply k-Means clustering
    kmeans_labels = kmeans_clustering(data_scaled, n_clusters)

    # Evaluate clustering quality using the Silhouette Score
    em_silhouette = silhouette_score(data_scaled, em_labels)
    kmeans_silhouette = silhouette_score(data_scaled, kmeans_labels)

    print(f'EM Clustering Silhouette Score: {em_silhouette:.4f}')
    print(f'K-Means Clustering Silhouette Score: {kmeans_silhouette:.4f}')

    if em_silhouette > kmeans_silhouette:
        print("EM algorithm provides better clustering quality.")
    else:
        print("K-Means provides better clustering quality.")

if __name__ == "__main__":
    main()
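The script assumes dataset.csv contains only numeric columns. If no such file exists, a simple synthetic one can be generated for testing (purely an assumed helper, not part of the original program):

from sklearn.datasets import make_blobs
X, _ = make_blobs(n_samples=300, centers=3, n_features=2, random_state=42)
pd.DataFrame(X, columns=['x1', 'x2']).to_csv('dataset.csv', index=False)
# Running main() afterwards prints and compares the two silhouette scores
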
9. k-Nearest Neighbors classifier (Iris dataset)
....................................................................

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset
df = pd.read_csv("iris.csv")   # Ensure the file is in the correct directory

# Split dataset into features and target variable
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train k-NN classifier
k = 3   # Choose k value
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Print correct and incorrect predictions
correct = []
incorrect = []
for i in range(len(y_test)):
    if y_pred[i] == y_test.iloc[i]:
        correct.append((X_test[i], y_pred[i]))
    else:
        incorrect.append((X_test[i], y_pred[i], y_test.iloc[i]))

print("\nCorrect Predictions:")
for item in correct:
    print(f'Predicted: {item[1]}')

print("\nIncorrect Predictions:")
for item in incorrect:
    print(f'Predicted: {item[1]}, Actual: {item[2]}')
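A natural follow-up experiment (an added sketch, not in the original listing) is to see how the choice of k affects accuracy, reusing the scaled split from above:

# Hypothetical sweep over several k values
for k_value in (1, 3, 5, 7, 9):
    acc = KNeighborsClassifier(n_neighbors=k_value).fit(X_train, y_train).score(X_test, y_test)
    print(f'k={k_value}: accuracy {acc * 100:.2f}%')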

10. Locally Weighted Regression
................................................................................................

import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

def kernel(x, x_point, tau):
    """Gaussian kernel: weight of each training point relative to the query point."""
    return np.exp(-np.sum((x - x_point) ** 2, axis=1) / (2 * tau ** 2))

def locally_weighted_regression(X_train, y_train, x_query, tau):
    """Fit a weighted least-squares model around x_query and return its prediction."""
    W = np.diag(kernel(X_train, x_query, tau))
    theta = np.linalg.pinv(X_train.T @ W @ X_train) @ X_train.T @ W @ y_train
    return x_query @ theta

# Generate synthetic dataset
np.random.seed(42)
X = np.linspace(-5, 5, 100).reshape(-1, 1)
y = np.sin(X).ravel() + np.random.normal(0, 0.2, X.shape[0])

# Add bias term
X_bias = np.c_[np.ones(X.shape[0]), X]

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X_bias, y, test_size=0.2, random_state=42)

# Fit Locally Weighted Regression
tau = 0.5   # Bandwidth parameter
y_pred = np.array([locally_weighted_regression(X_train, y_train, x, tau) for x in X_test])

# Plot results (plot the original feature column, not the bias column, on the x-axis)
plt.scatter(X[:, 0], y, label='Data', color='blue', alpha=0.5)
plt.scatter(X_test[:, 1], y_pred, label='Predictions', color='red', marker='x')
plt.xlabel('X')
plt.ylabel('y')
plt.legend()
plt.title('Locally Weighted Regression')
plt.show()
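The bandwidth tau controls how local the fit is: very small values chase the noise, very large values approach an ordinary least-squares line. A quick comparison across bandwidths (an added experiment, not part of the original program) reuses the split defined above:

# Hypothetical bandwidth sweep; lower test MSE indicates a better fit for this data
for tau_value in (0.1, 0.5, 1.0, 5.0):
    preds = np.array([locally_weighted_regression(X_train, y_train, x, tau_value) for x in X_test])
    print(f'tau={tau_value}: test MSE {np.mean((preds - y_test) ** 2):.4f}')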
