3rd program: ID3 decision tree
import ast
import csv
#import sys
import math
import os
def load_csv_to_header_data(filename):
    path = os.path.normpath(os.getcwd() + os.sep + filename)
    '''os.path.normpath(path)
    Normalize a pathname by collapsing redundant separators and up-level references so
    that A//B, A/B/, A/./B and A/foo/../B all become A/B. This string manipulation may
    change the meaning of a path that contains symbolic links. On Windows, it converts
    forward slashes to backward slashes. To normalize case, use normcase().'''
    print(path)
    fs = csv.reader(open(path))
    all_row = []
    for r in fs:
        all_row.append(r)
    headers = all_row[0]
    idx_to_name, name_to_idx = get_header_name_to_idx_maps(headers)
    data = {'header': headers, 'rows': all_row[1:],
            'name_to_idx': name_to_idx, 'idx_to_name': idx_to_name}
    return data
def get_header_name_to_idx_maps(headers):
    name_to_idx = {}
    idx_to_name = {}
    for i in range(0, len(headers)):
        name_to_idx[headers[i]] = i
        idx_to_name[i] = headers[i]
    #print(name_to_idx)
    #print(idx_to_name)
    return idx_to_name, name_to_idx
def project_columns(data, columns_to_project):
    data_h = list(data['header'])
    data_r = list(data['rows'])
    all_cols = list(range(0, len(data_h)))
    columns_to_project_ix = [data['name_to_idx'][name] for name in columns_to_project]
    #print(columns_to_project_ix)
    columns_to_remove = [cidx for cidx in all_cols if cidx not in columns_to_project_ix]
    #print(columns_to_remove)
    for delc in sorted(columns_to_remove, reverse=True):
        del data_h[delc]
        for r in data_r:
            del r[delc]
    idx_to_name, name_to_idx = get_header_name_to_idx_maps(data_h)
    return {'header': data_h, 'rows': data_r,
            'name_to_idx': name_to_idx, 'idx_to_name': idx_to_name}
def get_uniq_values(data):
    idx_to_name = data['idx_to_name']
    idxs = idx_to_name.keys()
    #print(idxs)
    val_map = {}
    for idx in iter(idxs):
        val_map[idx_to_name[idx]] = set()
    #print(val_map)
    for data_row in data['rows']:
        for idx in idx_to_name.keys():
            att_name = idx_to_name[idx]
            val = data_row[idx]
            val_map[att_name].add(val)  # a set already ignores duplicate values
    #print(val_map)
    return val_map
def get_class_labels(data, target_attribute):
    rows = data['rows']
    #print(rows)
    col_idx = data['name_to_idx'][target_attribute]
    #print(col_idx)
    labels = {}
    for r in rows:
        val = r[col_idx]
        if val in labels:
            labels[val] = labels[val] + 1
        else:
            labels[val] = 1
    #print(labels)
    return labels
def entropy(n, labels):
    ent = 0
    for label in labels.keys():
        p_x = labels[label] / n
        ent += -p_x * math.log(p_x, 2)
    return ent
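# Worked example (added for illustration, not part of the original listing):
# for a node with 9 positive and 5 negative labels,
#   entropy(14, {'yes': 9, 'no': 5})
#     = -(9/14)*log2(9/14) - (5/14)*log2(5/14), which is approximately 0.940.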
def partition_data(data, group_att):
    partitions = {}
    data_rows = data['rows']
    partition_att_idx = data['name_to_idx'][group_att]
    #print(partition_att_idx)
    for row in data_rows:
        row_val = row[partition_att_idx]
        #print(row_val)
        if row_val not in partitions.keys():
            partitions[row_val] = {'name_to_idx': data['name_to_idx'],
                                   'idx_to_name': data['idx_to_name'],
                                   'rows': list()}
        partitions[row_val]['rows'].append(row)
    #print(partitions)
    return partitions
def avg_entropy_w_partitions(data, splitting_att, target_attribute):
    # find uniq values of splitting att
    data_rows = data['rows']
    n = len(data_rows)
    partitions = partition_data(data, splitting_att)
    avg_ent = 0
    #p = partitions.keys()
    #print(p)
    for partition_key in partitions.keys():
        partitioned_data = partitions[partition_key]
        partition_n = len(partitioned_data['rows'])
        partition_labels = get_class_labels(partitioned_data, target_attribute)
        partition_entropy = entropy(partition_n, partition_labels)
        avg_ent += partition_n / n * partition_entropy
    return avg_ent, partitions
def most_common_label(labels):
    mcl = max(labels, key=lambda k: labels[k])
    return mcl
def id3(data, uniqs, remaining_atts, target_attribute):
    labels = get_class_labels(data, target_attribute)
    #print(labels)
    node = {}
    # a = len(labels.keys())
    # print(a)
    if len(labels.keys()) == 1:
        node['label'] = next(iter(labels.keys()))
        #print(node)
        return node
    #print(labels)
    if len(remaining_atts) == 0:
        node['label'] = most_common_label(labels)
        return node
    n = len(data['rows'])
    ent = entropy(n, labels)
    max_info_gain = None
    max_info_gain_att = None
    max_info_gain_partitions = None
    for remaining_att in remaining_atts:
        avg_ent, partitions = avg_entropy_w_partitions(data, remaining_att, target_attribute)
        info_gain = ent - avg_ent
        if max_info_gain is None or info_gain > max_info_gain:
            max_info_gain = info_gain
            max_info_gain_att = remaining_att
            max_info_gain_partitions = partitions
    if max_info_gain is None:
        node['label'] = most_common_label(labels)
        return node
    node['attribute'] = max_info_gain_att
    node['nodes'] = {}
    remaining_atts_for_subtrees = set(remaining_atts)
    remaining_atts_for_subtrees.discard(max_info_gain_att)
    uniq_att_values = uniqs[max_info_gain_att]
    for att_value in uniq_att_values:
        if att_value not in max_info_gain_partitions.keys():
            node['nodes'][att_value] = {'label': most_common_label(labels)}
            continue
        partition = max_info_gain_partitions[att_value]
        node['nodes'][att_value] = id3(partition, uniqs, remaining_atts_for_subtrees, target_attribute)
    return node
def load_config(config_file):
    with open(config_file, 'r') as myfile:
        data = myfile.read().replace('\n', '')
    print(data)
    return ast.literal_eval(data)
'''ast.literal_eval(node_or_string)
Safely evaluate an expression node or a string containing a Python literal or
container display. The string or node provided may only consist of the following
Python literal structures: strings, bytes, numbers, tuples, lists, dicts, sets,
booleans, and None.
This can be used for safely evaluating strings containing Python values from
untrusted sources without the need to parse the values oneself. It is not capable
of evaluating arbitrarily complex expressions, for example involving operators or
indexing.'''
def pretty_print_tree(root):
    stack = []
    rules = set()
    def traverse(node, stack, rules):
        if 'label' in node:
            stack.append(' THEN ' + node['label'])
            rules.add(''.join(stack))
            stack.pop()
        elif 'attribute' in node:
            ifnd = 'IF ' if not stack else ' AND '
            stack.append(ifnd + node['attribute'] + ' EQUALS ')
            for subnode_key in node['nodes']:
                stack.append(subnode_key)
                traverse(node['nodes'][subnode_key], stack, rules)
                stack.pop()
            stack.pop()
    traverse(root, stack, rules)
    print(os.linesep.join(rules))
def main():
    argv = 'id3_config.cfg'  # configuration file name (placeholder; original name lost)
    print("Command line args are {}: ".format(argv))
    config = load_config(argv)
    print(config)
    data = load_csv_to_header_data(config['data_file'])
    data = project_columns(data, config['data_project_columns'])
    target_attribute = config['target_attribute']
    remaining_attributes = set(data['header'])
    remaining_attributes.remove(target_attribute)
    print(remaining_attributes)
    uniqs = get_uniq_values(data)
    root = id3(data, uniqs, remaining_attributes, target_attribute)
    pretty_print_tree(root)

if __name__ == "__main__":
    main()
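A note on usage: main() expects a small configuration file that load_config() parses with
ast.literal_eval, so the file must contain a single Python dict literal with the keys read
in main(): 'data_file', 'data_project_columns' and 'target_attribute'. A minimal sketch of
such a file (the filename and column names below are illustrative, assuming the classic
play-tennis dataset) could look like:

{'data_file': 'tennis.csv',
 'data_project_columns': ['Outlook', 'Temperature', 'Humidity', 'Wind', 'PlayTennis'],
 'target_attribute': 'PlayTennis'}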
4th program: Backpropagation neural network
from math import exp
from random import seed
from random import random
# Initialize a network
def initialize_network(n_inputs, n_hidden, n_outputs):
    network = list()
    hidden_layer = [{'weights': [random() for i in range(n_inputs + 1)]} for i in range(n_hidden)]
    network.append(hidden_layer)
    #print(network)
    output_layer = [{'weights': [random() for i in range(n_hidden + 1)]} for i in range(n_outputs)]
    network.append(output_layer)
    #print(network)
    return network
# Calculate neuron activation for an input
def activate(weights, inputs):
    activation = weights[-1]
    for i in range(len(weights)-1):
        activation += weights[i] * inputs[i]
    return activation
# Transfer neuron activation
def transfer(activation):
    return 1.0 / (1.0 + exp(-activation))
# Forward propagate input to a network output
def forward_propagate(network, row):
    inputs = row
    for layer in network:
        #print(layer)
        new_inputs = []
        for neuron in layer:
            activation = activate(neuron['weights'], inputs)
            neuron['output'] = transfer(activation)
            new_inputs.append(neuron['output'])
        inputs = new_inputs
        #print(inputs)
    #print(inputs)
    return inputs
# Calculate the derivative of a neuron output
def transfer_derivative(output):
    return output * (1.0 - output)
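# Quick sanity check (illustrative, not part of the original listing):
# transfer(0.0) returns 0.5, and transfer_derivative(0.5) = 0.5 * (1 - 0.5) = 0.25,
# which is the slope of the sigmoid at its midpoint.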
# Backpropagate error and store in neurons
def backward_propagate_error(network, expected):
    for i in reversed(range(len(network))):
        layer = network[i]
        #print(layer)
        errors = list()
        if i != len(network)-1:
            for j in range(len(layer)):
                error = 0.0
                for neuron in network[i + 1]:
                    error += (neuron['weights'][j] * neuron['delta'])
                errors.append(error)
        else:
            for j in range(len(layer)):
                neuron = layer[j]
                errors.append(expected[j] - neuron['output'])
        for j in range(len(layer)):
            neuron = layer[j]
            neuron['delta'] = errors[j] * transfer_derivative(neuron['output'])
# Update network weights with error
def update_weights(network, row, l_rate):
    for i in range(len(network)):
        inputs = row[:-1]
        if i != 0:
            inputs = [neuron['output'] for neuron in network[i - 1]]
        for neuron in network[i]:
            for j in range(len(inputs)):
                neuron['weights'][j] += l_rate * neuron['delta'] * inputs[j]
            neuron['weights'][-1] += l_rate * neuron['delta']
# Train a network for a fixed number of epochs
def train_network(network, train, l_rate, n_epoch, n_outputs):
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            outputs = forward_propagate(network, row)
            #print(outputs)
            expected = [0 for i in range(n_outputs)]
            #print(expected)
            expected[row[-1]] = 1
            #print(expected)
            sum_error += sum([(expected[i]-outputs[i])**2 for i in range(len(expected))])
            #print(sum_error)
            backward_propagate_error(network, expected)
            update_weights(network, row, l_rate)
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
# Test training backprop algorithm
seed(1)
dataset = [[2.7810836,2.550537003,0],
[1.465489372,2.362125076,0],
[3.396561688,4.400293529,0],
[1.38807019,1.850220317,0],
[3.06407232,3.005305973,0],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]]
n_inputs = len(dataset[0]) - 1
#print(n_inputs)
n_outputs = len(set([row[-1] for row in dataset]))
#print(n_outputs)
network = initialize_network(n_inputs, 2, n_outputs)
train_network(network, dataset, 0.5, 20, n_outputs)
for layer in network:
    print(layer)
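Once trained, the network can classify a row by running forward_propagate and taking the
index of the largest output neuron. The original listing stops after printing the layers;
the helper below is a minimal sketch of that prediction step (the predict function and the
loop are additions for illustration, not part of the original program):

# Make a class prediction with a trained network (illustrative addition)
def predict(network, row):
    outputs = forward_propagate(network, row)
    return outputs.index(max(outputs))

for row in dataset:
    print('expected=%d, predicted=%d' % (row[-1], predict(network, row)))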
5th program: Naive Bayes classifier
print("\nNaive Bayes Classifier for concept learning problem")
import csv
#import random
import math
#import operator
def safe_div(x, y):
    if y == 0:
        return 0
    return x / y
def loadCsv(filename):
    lines = csv.reader(open(filename))
    dataset = list(lines)
    for i in range(len(dataset)):
        dataset[i] = [float(x) for x in dataset[i]]
    return dataset
def splitDataset(dataset, splitRatio):
    trainSize = int(len(dataset) * splitRatio)
    trainSet = []
    copy = list(dataset)
    i = 0
    while len(trainSet) < trainSize:
        #index = random.randrange(len(copy))
        trainSet.append(copy.pop(i))
    return [trainSet, copy]
def separateByClass(dataset):
    separated = {}
    for i in range(len(dataset)):
        vector = dataset[i]
        if (vector[-1] not in separated):
            separated[vector[-1]] = []
        separated[vector[-1]].append(vector)
    #print(separated)
    return separated
def mean(numbers):
    return safe_div(sum(numbers), float(len(numbers)))
def stdev(numbers):
    avg = mean(numbers)
    variance = safe_div(sum([pow(x-avg, 2) for x in numbers]), float(len(numbers)-1))
    return math.sqrt(variance)
def summarize(dataset):
    #for attribute in zip(*dataset):
    #    print(attribute)
    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]
    del summaries[-1]
    return summaries
def summarizeByClass(dataset):
    separated = separateByClass(dataset)
    summaries = {}
    #p = separated.items()
    #print(p)
    for classValue, instances in separated.items():
        #print(classValue)
        #print(instances)
        summaries[classValue] = summarize(instances)
    #print(summaries)
    return summaries
def calculateProbability(x, mean, stdev):
    exponent = math.exp(-safe_div(math.pow(x-mean, 2), (2*math.pow(stdev, 2))))
    final = safe_div(1, (math.sqrt(2*math.pi) * stdev)) * exponent
    return final
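# Worked example (added for illustration, not part of the original listing):
# calculateProbability(1.0, 1.0, 1.0) = exp(0) / (sqrt(2*pi) * 1.0), which is about 0.3989,
# the peak density of a Gaussian with mean 1.0 and standard deviation 1.0.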
def calculateClassProbabilities(summaries, inputVector):
    probabilities = {}
    for classValue, classSummaries in summaries.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, stdev = classSummaries[i]
            x = inputVector[i]
            probabilities[classValue] *= calculateProbability(x, mean, stdev)
    return probabilities
def predict(summaries, inputVector):
    probabilities = calculateClassProbabilities(summaries, inputVector)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel
def getPredictions(summaries, testSet):
    predictions = []
    for i in range(len(testSet)):
        result = predict(summaries, testSet[i])
        predictions.append(result)
    return predictions
def getAccuracy(testSet, predictions):
    correct = 0
    for i in range(len(testSet)):
        if testSet[i][-1] == predictions[i]:
            correct += 1
    accuracy = safe_div(correct, float(len(testSet))) * 100.0
    return accuracy
def main():
    filename = 'tennisdata.csv'  # CSV data file name (placeholder; original name lost)
    splitRatio = 0.75
    dataset = loadCsv(filename)
    trainingSet, testSet = splitDataset(dataset, splitRatio)
    print('Split {0} rows into'.format(len(dataset)))
    print('Number of Training data: ' + (repr(len(trainingSet))))
    print('Number of Test Data: ' + (repr(len(testSet))))
    print("\nThe values assumed for the concept learning attributes are\n")
    print("OUTLOOK=> Sunny=1 Overcast=2 Rain=3\nTEMPERATURE=> Hot=1 Mild=2 Cool=3\nHUMIDITY=> High=1 Normal=2\nWIND=> Weak=1 Strong=2")
    print("TARGET CONCEPT:PLAY TENNIS=> Yes=10 No=5")
    print("\nThe Training set are:")
    for x in trainingSet:
        print(x)
    print("\nThe Test data set are:")
    for x in testSet:
        print(x)
    print("\n")
    # prepare model
    summaries = summarizeByClass(trainingSet)
    # test model
    predictions = getPredictions(summaries, testSet)
    actual = []
    for i in range(len(testSet)):
        vector = testSet[i]
        actual.append(vector[-1])
    # Each test instance contributes equally to the overall accuracy; if every
    # prediction matches its actual label, the accuracy is 100%.
    print('Actual values: {0}'.format(actual))
    print('Predictions: {0}'.format(predictions))
    accuracy = getAccuracy(testSet, predictions)
    print('Accuracy: {0}%'.format(accuracy))

main()
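The file read by loadCsv() must be purely numeric, one row per training example, with the
columns encoded as printed in main() (OUTLOOK, TEMPERATURE, HUMIDITY, WIND, then PLAY TENNIS
as 10 for Yes and 5 for No). A few illustrative rows in that encoding (these sample values
are an assumption, not the actual contents of the original file):

1,1,1,1,5
1,1,1,2,5
2,1,1,1,10
3,2,1,1,10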