ML Lab....... 3-Converted New
ML Lab....... 3-Converted New
ML Lab....... 3-Converted New
Sourcecode:
import numpy as np
import matplotlib.pyplot as plt
class GradientDescent():
def init (self, alpha=0.1, tolerance=0.02, max_iterations=500):
self._alpha = alpha
self._tolerance = tolerance
self._max_iterations = max_iterations
self._thetas = None
def fit(self, xs, ys):
num_examples, num_features = np.shape(xs)
self._thetas = np.ones(num_features)
xs_transposed = xs.transpose()
for i in range(self._max_iterations):
diffs = np.dot(xs,self._thetas) - ys
cost = np.sum(diffs**2) / (2*num_examples)
gradient = np.dot(xs_transposed, diffs) / num_examples
self._thetas = self._thetas-self._alpha*gradient
if cost < self._tolerance:
return self._thetas
return self._thetas
def predict(self, x):
return np.dot(x, self._thetas)
data = np.loadtxt("iris.data.txt", usecols=(0,1,2,3), delimiter=',')
col_names = ['sepal length', 'sepal width', 'petal length', 'petal width']
data_map = dict(zip(col_names, data.transpose()))
features = np.column_stack((data_map['petal length'], np.ones(len(data_map['petal length']))))
gd = GradientDescent(tolerance=0.022)
thetas = gd.fit(features, data_map['petal width'])
gradient, intercept = thetas
ys = gd.predict(features)
plt.scatter(data_map['petal length'], data_map['petal width'])
plt.plot(data_map['petal length'], data_map['petal width'])
plt.show()
output:
Sourcecode:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
path = os.getcwd() + '\data\ex2data1.txt'
data = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
data.head()
positive = data[data['Admitted'].isin([1])]
negative = data[data['Admitted'].isin([0])]
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
def sigmoid(z):
return 1 / (1 + np.exp(-z))
nums = np.arange(-10, 10, step=1)
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(nums, sigmoid(nums), 'r')
cost(theta, X, y)
Sourcecode:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat
%matplotlib inline
data = loadmat('data/ex3data1.mat')
data
data['X'].shape, data['y'].shape
def sigmoid(z):
return 1 / (1 + np.exp(-z))
def cost(theta, X, y, learningRate):
theta = np.matrix(theta)
X = np.matrix(X)
y = np.matrix(y)
first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
reg = (learningRate / 2 * len(X)) * np.sum(np.power(theta[:,1:theta.shape[1]], 2))
return np.sum(first - second) / (len(X)) + reg
def gradient_with_loop(theta, X, y, learningRate):
theta = np.matrix(theta)
X = np.matrix(X)
y = np.matrix(y)
parameters = int(theta.ravel().shape[1])
grad = np.zeros(parameters)
error = sigmoid(X * theta.T) - y
for i in range(parameters):
term = np.multiply(error, X[:,i])
if (i == 0):
grad[i] = np.sum(term) / len(X)
else:
grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * theta[:,i])
return grad
def gradient(theta, X, y, learningRate):
theta = np.matrix(theta)
X = np.matrix(X)
y = np.matrix(y)
parameters = int(theta.ravel().shape[1])
error = sigmoid(X * theta.T) - y
grad = ((X.T * error) / len(X)).T + ((learningRate / len(X)) * theta)
grad[0, 0] = np.sum(np.multiply(error, X[:,0])) / len(X)
return np.array(grad).ravel()
from scipy.optimize import minimize
def one_vs_all(X, y, num_labels, learning_rate):
rows = X.shape[0]
params = X.shape[1]
all_theta = np.zeros((num_labels, params + 1))
X = np.insert(X, 0, values=np.ones(rows), axis=1)
for i in range(1, num_labels + 1):
theta = np.zeros(params + 1)
y_i = np.array([1 if label == i else 0 for label in y])
y_i = np.reshape(y_i, (rows, 1))
fmin = minimize(fun=cost, x0=theta, args=(X, y_i, learning_rate), method='TNC',
jac=gradient)
all_theta[i-1,:] = fmin.x
return all_theta
rows = data['X'].shape[0]
params = data['X'].shape[1]
all_theta = np.zeros((10, params + 1))
X = np.insert(data['X'], 0, values=np.ones(rows), axis=1)
theta = np.zeros(params + 1)
y_0 = np.array([1 if label == 0 else 0 for label in data['y']])
y_0 = np.reshape(y_0, (rows, 1))
X.shape, y_0.shape, theta.shape, all_theta.shape
np.unique(data['y'])
np.unique(data['y'])
all_theta = one_vs_all(data['X'], data['y'], 10, 1)
all_theta
def predict_all(X, all_theta):
rows = X.shape[0]
params = X.shape[1]
num_labels = all_theta.shape[0]
X = np.insert(X, 0, values=np.ones(rows), axis=1)
X = np.matrix(X)
all_theta = np.matrix(all_theta)
h = sigmoid(X * all_theta.T)
h_argmax = np.argmax(h, axis=1)
h_argmax = h_argmax + 1
return h_argmax
y_pred = predict_all(data['X'], all_theta)
correct = [1 if a == b else 0 for (a, b) in zip(y_pred, data['y'])]
accuracy = (sum(map(int, correct)) / float(len(correct)))
print ('accuracy = {0}%'.format(accuracy * 100))
accuracy = 74.6%
output:
{' header ': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16
13:09:09 2011',
Sourcecode:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
fruits = pd.read_table('fruit_data_with_colors.txt')
fruits.head()
print(fruits.shape)
print(fruits['fruit_name'].unique())
print(fruits.groupby('fruit_name').size())
import seaborn as sns
sns.countplot(fruits['fruit_name'],label="Count")
plt.show()
fruits.drop('fruit_label', axis=1).plot(kind='box', subplots=True, layout=(2,2), sharex=False, shar
ey=False, figsize=(9,9),
title='Box Plot for each input variable')
plt.savefig('fruits_box')
plt.show()
import pylab as pl
fruits.drop('fruit_label' ,axis=1).hist(bins=30, figsize=(9,9))
pl.suptitle("Histogram for each numeric input variable")
plt.savefig('fruits_hist')
plt.show()
import warnings
warnings.filterwarnings("ignore")
from pandas.tools.plotting import scatter_matrix
from matplotlib import cm
feature_names = ['mass', 'width', 'height', 'color_score']
X = fruits[feature_names]
y = fruits['fruit_label']
cmap = cm.get_cmap('gnuplot')
scatter = pd.scatter_matrix(X, c = y, marker = 'o', s=40, hist_kwds={'bins':15}, figsize=(9,9),
cmap = cmap)
plt.suptitle('Scatter-matrix for each input variable')
plt.savefig('fruits_scatter_matrix')
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
.format(logreg.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
.format(logreg.score(X_test, y_test)))
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier().fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
.format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
.format(clf.score(X_test, y_test)))
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
.format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
.format(knn.score(X_test, y_test)))
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
.format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
.format(svm.score(X_test, y_test)))
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
pred = knn.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
output:
fruit_label fruit_name fruit_subtype mass width height color_score
1 apple granny_smith 192 8.4 7.3 0.55
1 apple granny_smith 180 8.0 6.8 0.59
1 apple granny_smith 176 7.4 7.2 0.60
2 mandarin mandarin 86 6.2 4.7 0.80
2 mandarin mandarin 84 6.0 4.6 0.79
Accuracy of K-NN classifier on training set: 0.95
Accuracy of K-NN classifier on test set: 1.00
Accuracy of SVM classifier on training set: 0.61
Accuracy of SVM classifier on test set: 0.33
[[4 0 0 0]
[0 1 0 0]
[0 0 8 0]
[0 0 0 2]]
precision recall f1-score support
1 1.00 1.00 1.00 4
2 1.00 1.00 1.00 1
3 1.00 1.00 1.00 8 4 1.00 1.00 1.00 2 avg / total 1.00 1.00 1.00 15
7: Write a program to implement K-means Clustering Example
Sourcecode:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
%matplotlib inline
def find_closest_centroids(X, centroids):
m = X.shape[0]
k = centroids.shape[0]
idx = np.zeros(m)
for i in range(m):
min_dist = 1000000
for j in range(k):
dist = np.sum((X[i,:] - centroids[j,:]) ** 2)
if dist < min_dist:
min_dist = dist
idx[i] = j
return idx
data = loadmat('data/ex7data2.mat')
X = data['X']
initial_centroids = initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])
idx = find_closest_centroids(X, initial_centroids)
idx[0:3]
def compute_centroids(X, idx, k):
m, n = X.shape
centroids = np.zeros((k, n))
for i in range(k):
indices = np.where(idx == i)
centroids[i,:] = (np.sum(X[indices,:], axis=1) / len(indices[0])).ravel()
return centroids
compute_centroids(X, idx, 3)
def run_k_means(X, initial_centroids, max_iters):
m, n = X.shape
k = initial_centroids.shape[0]
idx = np.zeros(m)
centroids = initial_centroids
for i in range(max_iters):
for i in range(max_iters):
idx = find_closest_centroids(X, centroids)
centroids = compute_centroids(X, idx, k)
return idx, centroids
idx, centroids = run_k_means(X, initial_centroids, 10)
cluster1 = X[np.where(idx == 0)[0],:]
cluster2 = X[np.where(idx == 1)[0],:]
cluster3 = X[np.where(idx == 2)[0],:]
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(cluster1[:,0], cluster1[:,1], s=30, color='r', label='Cluster 1')
ax.scatter(cluster2[:,0], cluster2[:,1], s=30, color='g', label='Cluster 2')
ax.scatter(cluster3[:,0], cluster3[:,1], s=30, color='b', label='Cluster 3')
ax.legend()
def init_centroids(X, k):
m, n = X.shape
centroids = np.zeros((k, n))
idx = np.random.randint(0, m, k)
for i in range(k):
centroids[i,:] = X[idx[i],:]
return centroids
init_centroids(X, 3)
image_data = loadmat('data/bird_small.mat')
image_data
A = image_data['A']
A.shape
A = A / 255.
# reshape the array
X = np.reshape(A, (A.shape[0] * A.shape[1], A.shape[2]))
# randomly initialize the centroids
initial_centroids = init_centroids(X, 16)
idx, centroids = run_k_means(X, initial_centroids, 10)
# get the closest centroids one last time
idx = find_closest_centroids(X, centroids)
# map each pixel to the centroid value
X_recovered = centroids[idx.astype(int),:]
# reshape to the original dimensions
X_recovered = np.reshape(X_recovered, (A.shape[0], A.shape[1], A.shape[2]))
plt.imshow(X_recovered)
output:
array([0., 2., 1.])
{' header ': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Tue Jun 5 04:06:24
2012', ' version ': '1.0',
...,
...,
...,
...,
...,
...,
...,
dtype=uint8)}
(128, 128, 3)
8 : Write a program to Implement KNN algorithms with example
Sourcecode:
# Importing libraries
import pandas as pd
import numpy as np
import math
import operator
data = pd.read_csv("iris.csv")
data.head()
def euclideanDistance(data1, data2, length):
distance = 0
for x in range(length):
distance += np.square(data1[x] - data2[x])
return np.sqrt(distance)
def knn(trainingSet, testInstance, k):
distances = {}
sort = {}
length = testInstance.shape[1]
for x in range(len(trainingSet)):
dist = euclideanDistance(testInstance, trainingSet.iloc[x], length)
distances[x] = dist[0]
sorted_d = sorted(distances.items(), key=operator.itemgetter(1))
neighbors = []
for x in range(k):
neighbors.append(sorted_d[x][0])
classVotes = {}
for x in range(len(neighbors)):
response = trainingSet.iloc[neighbors[x]][-1]
if response in classVotes:
classVotes[response] += 1
else:
classVotes[response] = 1
sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
return(sortedVotes[0][0], neighbors)
testSet = [[7.2, 3.6, 5.1, 2.5]]
test = pd.DataFrame(testSet)
k=1
result,neigh = knn(data, test, k)
print(result)
print(neigh)
k=3
result,neigh = knn(data, test, k)
print(result)
print(neigh)
k=5
result,neigh = knn(data, test, k)
print(result)
In [12]:
print(neigh)
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(data.iloc[:,0:4], data['Name'])
print(neigh.predict(test))
print(neigh.kneighbors(test)[1])
output:
Sourcecode:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
%matplotlib inline
raw_data = loadmat('data/ex6data1.mat')
raw_data
data = pd.DataFrame(raw_data['X'], columns=['X1', 'X2'])
data['y'] = raw_data['y']
positive = data[data['y'].isin([1])]
negative = data[data['y'].isin([0])]
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['X1'], positive['X2'], s=50, marker='x', label='Positive')
ax.scatter(negative['X1'], negative['X2'], s=50, marker='o', label='Negative')
ax.legend()
from sklearn import svm
svc = svm.LinearSVC(C=1, loss='hinge', max_iter=1000)
svc
svc.fit(data[['X1', 'X2']], data['y'])
svc.score(data[['X1', 'X2']], data['y'])
svc2 = svm.LinearSVC(C=100, loss='hinge', max_iter=1000)
svc2.fit(data[['X1', 'X2']], data['y'])
svc2.score(data[['X1', 'X2']], data['y'])
data['SVM 1 Confidence'] = svc.decision_function(data[['X1', 'X2']])
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(data['X1'], data['X2'], s=50, c=data['SVM 1 Confidence'], cmap='seismic')
ax.set_title('SVM (C=1) Decision Confidence')
data['SVM 2 Confidence'] = svc2.decision_function(data[['X1', 'X2']])
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(data['X1'], data['X2'], s=50, c=data['SVM 2 Confidence'], cmap='seismic')
ax.set_title('SVM (C=100) Decision Confidence')
def gaussian_kernel(x1, x2, sigma):
return np.exp(-(np.sum((x1 - x2) ** 2) / (2 * (sigma ** 2))))
x1 = np.array([1.0, 2.0, 1.0])
x2 = np.array([0.0, 4.0, -1.0])
sigma = 2
gaussian_kernel(x1, x2, sigma)
raw_data = loadmat('data/ex6data2.mat')
data = pd.DataFrame(raw_data['X'], columns=['X1', 'X2'])
data['y'] = raw_data['y']
positive = data[data['y'].isin([1])]
negative = data[data['y'].isin([0])]
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['X1'], positive['X2'], s=30, marker='x', label='Positive')
ax.scatter(negative['X1'], negative['X2'], s=30, marker='o', label='Negative')
ax.legend()
svc = svm.SVC(C=100, gamma=10, probability=True)
svc.fit(data[['X1', 'X2']], data['y'])
data['Probability'] = svc.predict_proba(data[['X1', 'X2']])[:,0]
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(data['X1'], data['X2'], s=30, c=data['Probability'], cmap='Reds')
raw_data = loadmat('data/ex6data3.mat')
X = raw_data['X']
Xval = raw_data['Xval']
y = raw_data['y'].ravel()
yval = raw_data['yval'].ravel()
C_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
gamma_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
best_score = 0
best_params = {'C': None, 'gamma': None}
for C in C_values:
for gamma in gamma_values:
svc = svm.SVC(C=C, gamma=gamma)
svc.fit(X, y)
score = svc.score(Xval, yval)
if score > best_score:
best_score = score
best_params['C'] = C
best_params['gamma'] = gamma
best_score, best_params
spam_train = loadmat('data/spamTrain.mat')
spam_test = loadmat('data/spamTest.mat')
spam_train
X = spam_train['X']
Xtest = spam_test['Xtest']
y = spam_train['y'].ravel()
ytest = spam_test['ytest'].ravel()
X.shape, y.shape, Xtest.shape, ytest.shape
svc = svm.SVC()
svc.fit(X, y)
print('Test accuracy = {0}%'.format(np.round(svc.score(Xtest, ytest) * 100, 2)))
output:
{' header ': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Nov 13
14:28:43 2011', ' version ': '1.0', ' globals ': [], 'X': array([[1.9643 , 4.5957 ], [2.2753
, 3.8589 ], [2.9781 , 4.5651 ], [2.932 , 3.5519 ], [3.5772 , 2.856 ], [4.015
, 3.1937 ], [3.3814 , 3.4291 ], [3.9113 , 4.1761 ], [2.7822 , 4.0431 ],
[2.5518 , 4.6162 ], [3.3698 , 3.9101 ], [3.1048 , 3.0709 ], [1.9182 , 4.0534 ],
[2.2638 , 4.3706 ], [2.6555 , 3.5008 ], [3.1855 , 4.2888 ], [3.6579 , 3.8692 ],
[3.9113 , 3.4291 ], [3.6002 , 3.1221 ], [3.0357 , 3.3165 ], [1.5841 , 3.3575 ],
[2.0103 , 3.2039 ], [1.9527 , 2.7843 ], [2.2753 , 2.7127 ], [2.3099 , 2.9584 ],
[2.8283 , 2.6309 ], [3.0473 , 2.2931 ], [2.4827 , 2.0373 ], [2.5057 , 2.3853 ],
[1.8721 , 2.0577 ], [2.0103 , 2.3546 ], [1.2269 , 2.3239 ], [1.8951 , 2.9174 ],
[1.561 , 3.0709 ], [1.5495 , 2.6923 ], [1.6878 , 2.4057 ], [1.4919 , 2.0271 ],
[0.962 , 2.682 ], [1.1693 , 2.9276 ], [0.8122 , 2.9992 ], [0.9735 , 3.3881 ],
[1.25 , 3.1937 ], [1.3191 , 3.5109 ], [2.2292 , 2.201 ], [2.4482 , 2.6411 ],
[2.7938 , 1.9656 ], [2.091 , 1.6177 ], [2.5403 , 2.8867 ], [0.9044 , 3.0198 ],
[0.76615 , 2.5899 ], [0.086405, 4.1045 ]]), 'y': array([[1],
'y': array([[1], [1], [1], [1], [1], [1], [1], [1], [1], [1],
[1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [0], [0],
[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0],
[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0],
[0], [0], [0], [0], [1]], dtype=uint8)}
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1,
loss='hinge', max_iter=1000, multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
verbose=0)
0.9803921568627451
1.0
{' header ': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Nov 13
14:27:25 2011', ' version ': '1.0', ' globals ': [], 'X': array([[0, 0, 0, ..., 0, 0, 0], [0, 0, 0,
..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], ..., [0, 0, 0, ..., 0, 0, 0], [0, 0, 1, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8), 'y': array([[1], [1], [0], ..., [1], [0],