ML Lab....... 3-Converted New

Download as pdf or txt
Download as pdf or txt
You are on page 1of 27

PROGRAMS:

2: Write a program to implement multivariate linear regression

Sourcecode:
import numpy as np
import matplotlib.pyplot as plt
class GradientDescent():
def init (self, alpha=0.1, tolerance=0.02, max_iterations=500):
self._alpha = alpha
self._tolerance = tolerance
self._max_iterations = max_iterations
self._thetas = None
def fit(self, xs, ys):
num_examples, num_features = np.shape(xs)
self._thetas = np.ones(num_features)
xs_transposed = xs.transpose()
for i in range(self._max_iterations):
diffs = np.dot(xs,self._thetas) - ys
cost = np.sum(diffs**2) / (2*num_examples)
gradient = np.dot(xs_transposed, diffs) / num_examples
self._thetas = self._thetas-self._alpha*gradient
if cost < self._tolerance:
return self._thetas
return self._thetas
def predict(self, x):
return np.dot(x, self._thetas)
data = np.loadtxt("iris.data.txt", usecols=(0,1,2,3), delimiter=',')
col_names = ['sepal length', 'sepal width', 'petal length', 'petal width']
data_map = dict(zip(col_names, data.transpose()))
features = np.column_stack((data_map['petal length'], np.ones(len(data_map['petal length']))))
gd = GradientDescent(tolerance=0.022)
thetas = gd.fit(features, data_map['petal width'])
gradient, intercept = thetas
ys = gd.predict(features)
plt.scatter(data_map['petal length'], data_map['petal width'])
plt.plot(data_map['petal length'], data_map['petal width'])
plt.show()
output:

3 (a):Write a program to implement simple logistic regression

Sourcecode:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import os
path = os.getcwd() + '\data\ex2data1.txt'
data = pd.read_csv(path, header=None, names=['Exam 1', 'Exam 2', 'Admitted'])
data.head()

positive = data[data['Admitted'].isin([1])]
negative = data[data['Admitted'].isin([0])]

fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['Exam 1'], positive['Exam 2'], s=50, c='b', marker='o', label='Admitted')
ax.scatter(negative['Exam 1'], negative['Exam 2'], s=50, c='r', marker='x', label='Not Admitted')
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')

def sigmoid(z):
return 1 / (1 + np.exp(-z))
nums = np.arange(-10, 10, step=1)
fig, ax = plt.subplots(figsize=(12,8))
ax.plot(nums, sigmoid(nums), 'r')

def cost(theta, X, y):


theta = np.matrix(theta)
X = np.matrix(X)
y = np.matrix(y)
first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
return np.sum(first - second) / (len(X))
data.insert(0, 'Ones', 1)
cols = data.shape[1]
X = data.iloc[:,0:cols-1]
y = data.iloc[:,cols-1:cols]
X = np.array(X.values)
y = np.array(y.values)
theta = np.zeros(3)

X.shape, theta.shape, y.shape

cost(theta, X, y)

def gradient(theta, X, y):


theta = np.matrix(theta)
X = np.matrix(X)
y = np.matrix(y)
parameters = int(theta.ravel().shape[1])
grad = np.zeros(parameters)
error = sigmoid(X * theta.T) - y
for i in range(parameters):
term = np.multiply(error, X[:,i])
grad[i] = np.sum(term) / len(X)
return grad

import scipy.optimize as opt


result = opt.fmin_tnc(func=cost, x0=theta, fprime=gradient, args=(X, y))
cost(result[0], X, y)

def predict(theta, X):


probability = sigmoid(X * theta.T)
return [1 if x >= 0.5 else 0 for x in probability]
theta_min = np.matrix(result[0])
predictions = predict(theta_min, X)
correct = [1 if ((a == 1 and b == 1) or (a == 0 and b == 0)) else 0 for (a, b) in zip(predictions, y
)]
accuracy = (sum(map(int, correct)) % len(correct))
print ('accuracy = {0}%'.format(accuracy))
output:

Exam 1 Exam 2 Admitt


ed
0 34.6236 78.0246 0
60 93
1 30.2867 43.8949 0
11 98
2 35.8474 72.9021 0
09 98
3 60.1825 86.3085 1
99 52
4 79.0327 75.3443 1
36 76

3(b): Write a program to implement multivariate logistics regression

Sourcecode:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat
%matplotlib inline
data = loadmat('data/ex3data1.mat')
data

data['X'].shape, data['y'].shape

def sigmoid(z):
return 1 / (1 + np.exp(-z))
def cost(theta, X, y, learningRate):
theta = np.matrix(theta)
X = np.matrix(X)
y = np.matrix(y)
first = np.multiply(-y, np.log(sigmoid(X * theta.T)))
second = np.multiply((1 - y), np.log(1 - sigmoid(X * theta.T)))
reg = (learningRate / 2 * len(X)) * np.sum(np.power(theta[:,1:theta.shape[1]], 2))
return np.sum(first - second) / (len(X)) + reg
def gradient_with_loop(theta, X, y, learningRate):
theta = np.matrix(theta)
X = np.matrix(X)
y = np.matrix(y)
parameters = int(theta.ravel().shape[1])
grad = np.zeros(parameters)
error = sigmoid(X * theta.T) - y
for i in range(parameters):
term = np.multiply(error, X[:,i])
if (i == 0):
grad[i] = np.sum(term) / len(X)
else:
grad[i] = (np.sum(term) / len(X)) + ((learningRate / len(X)) * theta[:,i])
return grad
def gradient(theta, X, y, learningRate):
theta = np.matrix(theta)
X = np.matrix(X)
y = np.matrix(y)
parameters = int(theta.ravel().shape[1])
error = sigmoid(X * theta.T) - y
grad = ((X.T * error) / len(X)).T + ((learningRate / len(X)) * theta)
grad[0, 0] = np.sum(np.multiply(error, X[:,0])) / len(X)
return np.array(grad).ravel()
from scipy.optimize import minimize
def one_vs_all(X, y, num_labels, learning_rate):
rows = X.shape[0]
params = X.shape[1]
all_theta = np.zeros((num_labels, params + 1))
X = np.insert(X, 0, values=np.ones(rows), axis=1)
for i in range(1, num_labels + 1):
theta = np.zeros(params + 1)
y_i = np.array([1 if label == i else 0 for label in y])
y_i = np.reshape(y_i, (rows, 1))
fmin = minimize(fun=cost, x0=theta, args=(X, y_i, learning_rate), method='TNC',
jac=gradient)
all_theta[i-1,:] = fmin.x
return all_theta
rows = data['X'].shape[0]
params = data['X'].shape[1]
all_theta = np.zeros((10, params + 1))
X = np.insert(data['X'], 0, values=np.ones(rows), axis=1)
theta = np.zeros(params + 1)
y_0 = np.array([1 if label == 0 else 0 for label in data['y']])
y_0 = np.reshape(y_0, (rows, 1))
X.shape, y_0.shape, theta.shape, all_theta.shape

np.unique(data['y'])
np.unique(data['y'])
all_theta = one_vs_all(data['X'], data['y'], 10, 1)
all_theta
def predict_all(X, all_theta):
rows = X.shape[0]
params = X.shape[1]
num_labels = all_theta.shape[0]
X = np.insert(X, 0, values=np.ones(rows), axis=1)
X = np.matrix(X)
all_theta = np.matrix(all_theta)
h = sigmoid(X * all_theta.T)
h_argmax = np.argmax(h, axis=1)
h_argmax = h_argmax + 1
return h_argmax
y_pred = predict_all(data['X'], all_theta)
correct = [1 if a == b else 0 for (a, b) in zip(y_pred, data['y'])]
accuracy = (sum(map(int, correct)) / float(len(correct)))
print ('accuracy = {0}%'.format(accuracy * 100))
accuracy = 74.6%

output:
{' header ': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Oct 16
13:09:09 2011',

' version ': '1.0',


' globals ': [],
'X': array([[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
...,
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.],
[0., 0., 0., ..., 0., 0., 0.]]),
'y': array([[10],
[10],
[10],
...,
[ 9],
[ 9],
[ 9]], dtype=uint8)}

array([[-3.70247923e-05, 0.00000000e+00, 0.00000000e+00, ...,


-2.24803603e-10, 2.31962906e-11, 0.00000000e+00],
[-8.96250745e-05, 0.00000000e+00, 0.00000000e+00, ...,
7.26120886e-09, -6.19965350e-10, 0.00000000e+00],
[-8.39553309e-05, 0.00000000e+00, 0.00000000e+00, ...,
-7.61695539e-10, 4.64917610e-11, 0.00000000e+00],
...,
[-7.00832394e-05, 0.00000000e+00, 0.00000000e+00, ...,
-6.92008993e-10, 4.29241468e-11, 0.00000000e+00],
[-7.65187921e-05, 0.00000000e+00, 0.00000000e+00, ...,
-8.09503259e-10, 5.31058709e-11, 0.00000000e+00],
[-6.63412370e-05, 0.00000000e+00, 0.00000000e+00, ...,
-3.49765862e-09, 1.13668519e-10, 0.00000000e+00]])

5:Write a program to implement - Classification Algorithm

Sourcecode:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
fruits = pd.read_table('fruit_data_with_colors.txt')
fruits.head()
print(fruits.shape)
print(fruits['fruit_name'].unique())
print(fruits.groupby('fruit_name').size())
import seaborn as sns
sns.countplot(fruits['fruit_name'],label="Count")
plt.show()
fruits.drop('fruit_label', axis=1).plot(kind='box', subplots=True, layout=(2,2), sharex=False, shar
ey=False, figsize=(9,9),
title='Box Plot for each input variable')
plt.savefig('fruits_box')
plt.show()
import pylab as pl
fruits.drop('fruit_label' ,axis=1).hist(bins=30, figsize=(9,9))
pl.suptitle("Histogram for each numeric input variable")
plt.savefig('fruits_hist')
plt.show()
import warnings
warnings.filterwarnings("ignore")
from pandas.tools.plotting import scatter_matrix
from matplotlib import cm
feature_names = ['mass', 'width', 'height', 'color_score']
X = fruits[feature_names]
y = fruits['fruit_label']
cmap = cm.get_cmap('gnuplot')
scatter = pd.scatter_matrix(X, c = y, marker = 'o', s=40, hist_kwds={'bins':15}, figsize=(9,9),
cmap = cmap)
plt.suptitle('Scatter-matrix for each input variable')
plt.savefig('fruits_scatter_matrix')
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
.format(logreg.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
.format(logreg.score(X_test, y_test)))
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier().fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
.format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
.format(clf.score(X_test, y_test)))
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
.format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
.format(knn.score(X_test, y_test)))
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
.format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
.format(svm.score(X_test, y_test)))
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
pred = knn.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

output:
fruit_label fruit_name fruit_subtype mass width height color_score
1 apple granny_smith 192 8.4 7.3 0.55
1 apple granny_smith 180 8.0 6.8 0.59
1 apple granny_smith 176 7.4 7.2 0.60
2 mandarin mandarin 86 6.2 4.7 0.80
2 mandarin mandarin 84 6.0 4.6 0.79
Accuracy of K-NN classifier on training set: 0.95
Accuracy of K-NN classifier on test set: 1.00
Accuracy of SVM classifier on training set: 0.61
Accuracy of SVM classifier on test set: 0.33
[[4 0 0 0]
[0 1 0 0]
[0 0 8 0]
[0 0 0 2]]
precision recall f1-score support
1 1.00 1.00 1.00 4
2 1.00 1.00 1.00 1
3 1.00 1.00 1.00 8 4 1.00 1.00 1.00 2 avg / total 1.00 1.00 1.00 15
7: Write a program to implement K-means Clustering Example

Sourcecode:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
%matplotlib inline
def find_closest_centroids(X, centroids):
m = X.shape[0]
k = centroids.shape[0]
idx = np.zeros(m)
for i in range(m):
min_dist = 1000000
for j in range(k):
dist = np.sum((X[i,:] - centroids[j,:]) ** 2)
if dist < min_dist:
min_dist = dist
idx[i] = j
return idx
data = loadmat('data/ex7data2.mat')
X = data['X']
initial_centroids = initial_centroids = np.array([[3, 3], [6, 2], [8, 5]])
idx = find_closest_centroids(X, initial_centroids)
idx[0:3]
def compute_centroids(X, idx, k):
m, n = X.shape
centroids = np.zeros((k, n))
for i in range(k):
indices = np.where(idx == i)
centroids[i,:] = (np.sum(X[indices,:], axis=1) / len(indices[0])).ravel()
return centroids
compute_centroids(X, idx, 3)
def run_k_means(X, initial_centroids, max_iters):
m, n = X.shape
k = initial_centroids.shape[0]
idx = np.zeros(m)
centroids = initial_centroids
for i in range(max_iters):
for i in range(max_iters):
idx = find_closest_centroids(X, centroids)
centroids = compute_centroids(X, idx, k)
return idx, centroids
idx, centroids = run_k_means(X, initial_centroids, 10)
cluster1 = X[np.where(idx == 0)[0],:]
cluster2 = X[np.where(idx == 1)[0],:]
cluster3 = X[np.where(idx == 2)[0],:]
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(cluster1[:,0], cluster1[:,1], s=30, color='r', label='Cluster 1')
ax.scatter(cluster2[:,0], cluster2[:,1], s=30, color='g', label='Cluster 2')
ax.scatter(cluster3[:,0], cluster3[:,1], s=30, color='b', label='Cluster 3')
ax.legend()
def init_centroids(X, k):
m, n = X.shape
centroids = np.zeros((k, n))
idx = np.random.randint(0, m, k)
for i in range(k):
centroids[i,:] = X[idx[i],:]
return centroids
init_centroids(X, 3)
image_data = loadmat('data/bird_small.mat')
image_data
A = image_data['A']
A.shape
A = A / 255.
# reshape the array
X = np.reshape(A, (A.shape[0] * A.shape[1], A.shape[2]))
# randomly initialize the centroids
initial_centroids = init_centroids(X, 16)
idx, centroids = run_k_means(X, initial_centroids, 10)
# get the closest centroids one last time
idx = find_closest_centroids(X, centroids)
# map each pixel to the centroid value
X_recovered = centroids[idx.astype(int),:]
# reshape to the original dimensions
X_recovered = np.reshape(X_recovered, (A.shape[0], A.shape[1], A.shape[2]))
plt.imshow(X_recovered)

output:
array([0., 2., 1.])

array ([[2.42830111, 3.15792418], [5.81350331, 2.63365645], [7.11938687, 3.6166844


]])
Array ([[ 3.27844295, 1.75043926], [ 1.84207953, 4.6075716 ], [-0.24512713,
5.74019237]])

{' header ': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Tue Jun 5 04:06:24
2012', ' version ': '1.0',

' globals ': [],

'A': array ([[[219, 180, 103],

[230, 185, 116],

[226, 186, 110],

...,

[ 14, 15, 13],

[ 13, 15, 12],

[ 12, 14, 12]],

[[230, 193, 119],

[224, 192, 120],


[226, 192, 124],

...,

[ 16, 16, 13],

[ 14, 15, 10],

[ 11, 14, 9]],

[[228, 191, 123],

[228, 191, 121],

[220, 185, 118],

...,

[ 14, 16, 13],

[ 13, 13, 11],

[ 11, 15, 10]],

...,

[[ 15, 18, 16],

[ 18, 21, 18],

[ 18, 19, 16],

...,

[ 81, 45, 45],

[ 70, 43, 35],

[ 72, 51, 43]],

[[ 16, 17, 17],

[ 17, 18, 19],

[ 20, 19, 20],

...,

[ 80, 38, 40],


[ 68, 39, 40],

[ 59, 43, 42]],

[[ 15, 19, 19],

[ 20, 20, 18],

[ 18, 19, 17],

...,

[ 65, 43, 39],

[ 58, 37, 38],

[ 52, 39, 34]]],

dtype=uint8)}

(128, 128, 3)
8 : Write a program to Implement KNN algorithms with example

Sourcecode:
# Importing libraries
import pandas as pd
import numpy as np
import math
import operator
data = pd.read_csv("iris.csv")
data.head()
def euclideanDistance(data1, data2, length):
distance = 0
for x in range(length):
distance += np.square(data1[x] - data2[x])
return np.sqrt(distance)
def knn(trainingSet, testInstance, k):
distances = {}
sort = {}
length = testInstance.shape[1]
for x in range(len(trainingSet)):
dist = euclideanDistance(testInstance, trainingSet.iloc[x], length)
distances[x] = dist[0]
sorted_d = sorted(distances.items(), key=operator.itemgetter(1))
neighbors = []
for x in range(k):
neighbors.append(sorted_d[x][0])
classVotes = {}
for x in range(len(neighbors)):
response = trainingSet.iloc[neighbors[x]][-1]
if response in classVotes:
classVotes[response] += 1
else:
classVotes[response] = 1
sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
return(sortedVotes[0][0], neighbors)
testSet = [[7.2, 3.6, 5.1, 2.5]]
test = pd.DataFrame(testSet)
k=1
result,neigh = knn(data, test, k)
print(result)
print(neigh)
k=3
result,neigh = knn(data, test, k)
print(result)
print(neigh)
k=5
result,neigh = knn(data, test, k)
print(result)
In [12]:
print(neigh)
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(data.iloc[:,0:4], data['Name'])
print(neigh.predict(test))
print(neigh.kneighbors(test)[1])

output:

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',


metric_params=None, n_jobs=1, n_neighbors=3, p=2, weights='uniform')
9: Write a program to implement SVM Example

Sourcecode:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
%matplotlib inline
raw_data = loadmat('data/ex6data1.mat')
raw_data
data = pd.DataFrame(raw_data['X'], columns=['X1', 'X2'])
data['y'] = raw_data['y']
positive = data[data['y'].isin([1])]
negative = data[data['y'].isin([0])]
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['X1'], positive['X2'], s=50, marker='x', label='Positive')
ax.scatter(negative['X1'], negative['X2'], s=50, marker='o', label='Negative')
ax.legend()
from sklearn import svm
svc = svm.LinearSVC(C=1, loss='hinge', max_iter=1000)
svc
svc.fit(data[['X1', 'X2']], data['y'])
svc.score(data[['X1', 'X2']], data['y'])
svc2 = svm.LinearSVC(C=100, loss='hinge', max_iter=1000)
svc2.fit(data[['X1', 'X2']], data['y'])
svc2.score(data[['X1', 'X2']], data['y'])
data['SVM 1 Confidence'] = svc.decision_function(data[['X1', 'X2']])
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(data['X1'], data['X2'], s=50, c=data['SVM 1 Confidence'], cmap='seismic')
ax.set_title('SVM (C=1) Decision Confidence')
data['SVM 2 Confidence'] = svc2.decision_function(data[['X1', 'X2']])
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(data['X1'], data['X2'], s=50, c=data['SVM 2 Confidence'], cmap='seismic')
ax.set_title('SVM (C=100) Decision Confidence')
def gaussian_kernel(x1, x2, sigma):
return np.exp(-(np.sum((x1 - x2) ** 2) / (2 * (sigma ** 2))))
x1 = np.array([1.0, 2.0, 1.0])
x2 = np.array([0.0, 4.0, -1.0])
sigma = 2
gaussian_kernel(x1, x2, sigma)
raw_data = loadmat('data/ex6data2.mat')
data = pd.DataFrame(raw_data['X'], columns=['X1', 'X2'])
data['y'] = raw_data['y']
positive = data[data['y'].isin([1])]
negative = data[data['y'].isin([0])]
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive['X1'], positive['X2'], s=30, marker='x', label='Positive')
ax.scatter(negative['X1'], negative['X2'], s=30, marker='o', label='Negative')
ax.legend()
svc = svm.SVC(C=100, gamma=10, probability=True)
svc.fit(data[['X1', 'X2']], data['y'])
data['Probability'] = svc.predict_proba(data[['X1', 'X2']])[:,0]
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(data['X1'], data['X2'], s=30, c=data['Probability'], cmap='Reds')
raw_data = loadmat('data/ex6data3.mat')
X = raw_data['X']
Xval = raw_data['Xval']
y = raw_data['y'].ravel()
yval = raw_data['yval'].ravel()
C_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
gamma_values = [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
best_score = 0
best_params = {'C': None, 'gamma': None}
for C in C_values:
for gamma in gamma_values:
svc = svm.SVC(C=C, gamma=gamma)
svc.fit(X, y)
score = svc.score(Xval, yval)
if score > best_score:
best_score = score
best_params['C'] = C
best_params['gamma'] = gamma
best_score, best_params
spam_train = loadmat('data/spamTrain.mat')
spam_test = loadmat('data/spamTest.mat')
spam_train
X = spam_train['X']
Xtest = spam_test['Xtest']
y = spam_train['y'].ravel()
ytest = spam_test['ytest'].ravel()
X.shape, y.shape, Xtest.shape, ytest.shape
svc = svm.SVC()
svc.fit(X, y)
print('Test accuracy = {0}%'.format(np.round(svc.score(Xtest, ytest) * 100, 2)))
output:
{' header ': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Nov 13
14:28:43 2011', ' version ': '1.0', ' globals ': [], 'X': array([[1.9643 , 4.5957 ], [2.2753
, 3.8589 ], [2.9781 , 4.5651 ], [2.932 , 3.5519 ], [3.5772 , 2.856 ], [4.015
, 3.1937 ], [3.3814 , 3.4291 ], [3.9113 , 4.1761 ], [2.7822 , 4.0431 ],
[2.5518 , 4.6162 ], [3.3698 , 3.9101 ], [3.1048 , 3.0709 ], [1.9182 , 4.0534 ],
[2.2638 , 4.3706 ], [2.6555 , 3.5008 ], [3.1855 , 4.2888 ], [3.6579 , 3.8692 ],
[3.9113 , 3.4291 ], [3.6002 , 3.1221 ], [3.0357 , 3.3165 ], [1.5841 , 3.3575 ],
[2.0103 , 3.2039 ], [1.9527 , 2.7843 ], [2.2753 , 2.7127 ], [2.3099 , 2.9584 ],
[2.8283 , 2.6309 ], [3.0473 , 2.2931 ], [2.4827 , 2.0373 ], [2.5057 , 2.3853 ],
[1.8721 , 2.0577 ], [2.0103 , 2.3546 ], [1.2269 , 2.3239 ], [1.8951 , 2.9174 ],
[1.561 , 3.0709 ], [1.5495 , 2.6923 ], [1.6878 , 2.4057 ], [1.4919 , 2.0271 ],
[0.962 , 2.682 ], [1.1693 , 2.9276 ], [0.8122 , 2.9992 ], [0.9735 , 3.3881 ],
[1.25 , 3.1937 ], [1.3191 , 3.5109 ], [2.2292 , 2.201 ], [2.4482 , 2.6411 ],
[2.7938 , 1.9656 ], [2.091 , 1.6177 ], [2.5403 , 2.8867 ], [0.9044 , 3.0198 ],
[0.76615 , 2.5899 ], [0.086405, 4.1045 ]]), 'y': array([[1],

In [3]: data = pd.DataFrame(raw_data['X'], columns=['X1', 'X2']) data['y'] = raw_data['y']


positive = data[data['y'].isin([1])] negative = data[data['y'].isin([0])] fig, ax =
plt.subplots(figsize=(12,8)) ax.scatter(positive['X1'], positive['X2'], s=50, marker='x',
label='Positive') ax.scatter(negative['X1'], negative['X2'], s=50, marker='o', label='Negative')
ax.legend()

'y': array([[1], [1], [1], [1], [1], [1], [1], [1], [1], [1],
[1], [1], [1], [1], [1], [1], [1], [1], [1], [1], [0], [0],
[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0],
[0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0], [0],
[0], [0], [0], [0], [1]], dtype=uint8)}
LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True, intercept_scaling=1,
loss='hinge', max_iter=1000, multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
verbose=0)

0.9803921568627451

1.0

Text(0.5,1,'SVM (C=1) Decision Confidence')


Text(0.5,1,'SVM (C=100) Decision Confidence')
(0.965, {'C': 0.3, 'gamma': 100})

{' header ': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Sun Nov 13
14:27:25 2011', ' version ': '1.0', ' globals ': [], 'X': array([[0, 0, 0, ..., 0, 0, 0], [0, 0, 0,
..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], ..., [0, 0, 0, ..., 0, 0, 0], [0, 0, 1, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8), 'y': array([[1], [1], [0], ..., [1], [0],

In [15]: X = spam_train['X'] Xtest = spam_test['Xtest'] y = spam_train['y'].ravel() ytest =


spam_test['ytest'].ravel() X.shape, y.shape, Xtest.shape, ytest.shape

In [16]: svc = svm.SVC() svc.fit(X, y) print('Test accuracy =


{0}%'.format(np.round(svc.score(Xtest, ytest) * 100, 2)))

[0], [0]], dtype=uint8)}

((4000, 1899), (4000,), (1000, 1899), (1000,))

Test accuracy = 95.3%

You might also like