Naive Bayes
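This fragment assumes y_test and y_pred already exist. A minimal sketch that produces them with sklearn's GaussianNB on the iris data (the dataset choice is an assumption, not part of the original):

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# load a toy labelled dataset (any labelled dataset works here)
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)

# fit the Gaussian Naive Bayes model and predict on the held-out split
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)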
# comparing actual response values (y_test) with predicted response values (y_pred)
from sklearn import metrics
print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(y_test,
y_pred)*100)
--------------------------------------------------------------------
linear regression
import numpy as np
import matplotlib.pyplot as plt
# putting labels
plt.xlabel('x')
plt.ylabel('y')
def main():
    # observations / data
    x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
    # estimating coefficients (estimate_coef is defined in the full listing later in this section)
    b = estimate_coef(x, y)
    print("Estimated coefficients:\nb_0 = {}\nb_1 = {}".format(b[0], b[1]))
-----------------------------------
# importing the libraries this fragment needs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split

data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
y = raw_df.values[1::2, 2]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=1)
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
# regression coefficients
print('Coefficients: ', reg.coef_)
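The legend and title below belong to a residual plot whose lines are missing from this fragment; a hedged sketch of the usual residual-error scatter, reusing the names above:

# residual errors on training data
plt.scatter(reg.predict(X_train), reg.predict(X_train) - y_train,
            color="green", s=10, label='Train data')
# residual errors on test data
plt.scatter(reg.predict(X_test), reg.predict(X_test) - y_test,
            color="blue", s=10, label='Test data')
# zero-residual reference line
plt.hlines(y=0, xmin=0, xmax=50, linewidth=2)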
# plotting legend
plt.legend(loc='upper right')
# plot title
plt.title("Residual errors")
plt.show()
---------------------------
Polynomial Regression
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

df = pd.read_csv('Position_Salaries.csv')
X = df.iloc[:, 1:2].values
y = df.iloc[:, 2].values

# simple linear fit (reconstructed: the lines defining lin_reg were lost)
lin_reg = LinearRegression()
lin_reg.fit(X, y)

# degree-2 polynomial fit (reconstructed; lin_reg_2 is referenced below)
poly_reg2 = PolynomialFeatures(degree=2)
X_poly2 = poly_reg2.fit_transform(X)
lin_reg_2 = LinearRegression()
lin_reg_2.fit(X_poly2, y)

# degree-3 polynomial fit
poly_reg3 = PolynomialFeatures(degree=3)
X_poly3 = poly_reg3.fit_transform(X)
lin_reg_3 = LinearRegression()
lin_reg_3.fit(X_poly3, y)

plt.scatter(X, y, color='red')
plt.plot(X, lin_reg.predict(X), color='green')
plt.title('Simple Linear Regression')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.show()
plt.style.use('fivethirtyeight')
plt.scatter(X,y,color='red')
plt.plot(X,lin_reg_2.predict(poly_reg2.fit_transform(X)),color='green')
plt.plot(X,lin_reg_3.predict(poly_reg3.fit_transform(X)),color='yellow')
plt.title('Polynomial Regression: Degree 2 vs Degree 3')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.show()
plt.style.use('fivethirtyeight')
# np.arange gives a vector; reshape it into a one-column matrix
X_grid = np.arange(min(X), max(X), 0.1)
X_grid = X_grid.reshape((len(X_grid), 1))
plt.scatter(X, y, color='red')
plt.plot(X_grid, lin_reg_3.predict(poly_reg3.fit_transform(X_grid)), color='lightgreen')
#plt.plot(X,lin_reg_3.predict(poly_reg3.fit_transform(X)),color='green')
plt.title('Polynomial Linear Regression Degree 3')
plt.xlabel('Position Level')
plt.ylabel('Salary')
plt.show()
--------------------------------------------------
KNN
import math
def classifyAPoint(points, p, k=3):
    '''
    Classify point p with the k-nearest-neighbour algorithm.
    Assumes only two groups; returns 0 if p belongs to group 0,
    else 1 (belongs to group 1).
    Parameters -
        points: dictionary of training points with two keys, 0 and 1;
                each key holds a list of training points in that group
        p: the point to classify, as an (x, y) tuple
        k: number of neighbours to consider
    '''
    distance = []
    for group in points:
        for feature in points[group]:
            # Euclidean distance from p to each training point
            # (reconstructed: the original distance lines were lost)
            euclidean_distance = math.sqrt((feature[0] - p[0]) ** 2
                                           + (feature[1] - p[1]) ** 2)
            distance.append((euclidean_distance, group))
    # keep the k nearest neighbours and count votes per group
    distance = sorted(distance)[:k]
    freq1 = 0  # votes for group 0
    freq2 = 0  # votes for group 1
    for d in distance:
        if d[1] == 0:
            freq1 += 1
        elif d[1] == 1:
            freq2 += 1
    return 0 if freq1 > freq2 else 1

# driver function
def main():
    points = {0: [(1, 12), (2, 5), (3, 6), (3, 10), (3.5, 8), (2, 11), (2, 9), (1, 7)],
              1: [(5, 3), (3, 2), (1.5, 9), (7, 2), (6, 1), (3.8, 1), (5.6, 4), (4, 2), (2, 5)]}
    # query point (example value; the original point was lost from this fragment)
    p = (2.5, 7)
    # Number of neighbours
    k = 3
    print("The value classified to unknown point is: {}".format(classifyAPoint(points, p, k)))

if __name__ == '__main__':
    main()
--------------------------------------------------------------
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

class ElasticRegression():
    def __init__(self, learning_rate, iterations, l1_penality, l2_penality):
        self.learning_rate = learning_rate
        self.iterations = iterations
        self.l1_penality = l1_penality
        self.l2_penality = l2_penality

    # model training
    def fit(self, X, Y):
        # no_of_training_examples, no_of_features
        self.m, self.n = X.shape
        # weight initialization
        self.W = np.zeros(self.n)
        self.b = 0
        self.X = X
        self.Y = Y
        # gradient descent learning
        for i in range(self.iterations):
            self.update_weights()
        return self

    def update_weights(self):
        Y_pred = self.predict(self.X)
        # calculate gradients (reconstructed around the surviving penalty terms)
        dW = np.zeros(self.n)
        for j in range(self.n):
            if self.W[j] > 0:
                dW[j] = (-(2 * (self.X[:, j]).dot(self.Y - Y_pred))
                         + self.l1_penality + 2 * self.l2_penality *
                         self.W[j]) / self.m
            else:
                dW[j] = (-(2 * (self.X[:, j]).dot(self.Y - Y_pred))
                         - self.l1_penality + 2 * self.l2_penality *
                         self.W[j]) / self.m
        db = -2 * np.sum(self.Y - Y_pred) / self.m
        # update weights
        self.W = self.W - self.learning_rate * dW
        self.b = self.b - self.learning_rate * db
        return self

    # Hypothetical function h( x )
    def predict(self, X):
        return X.dot(self.W) + self.b

# Driver Code
def main():
    # Importing dataset
    df = pd.read_csv("salary_data.csv")
    X = df.iloc[:, :-1].values
    Y = df.iloc[:, 1].values
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                        test_size=1/3,
                                                        random_state=0)
    # Model training (hyperparameter values are examples; the originals were lost)
    model = ElasticRegression(iterations=1000, learning_rate=0.01,
                              l1_penality=500, l2_penality=1)
    model.fit(X_train, Y_train)
    # Prediction and plotting (plot labels reconstructed)
    Y_pred = model.predict(X_test)
    plt.scatter(X_test, Y_test, color='blue')
    plt.plot(X_test, Y_pred, color='orange')
    plt.xlabel('Years of Experience')
    plt.ylabel('Salary')
    plt.show()

if __name__ == "__main__":
    main()
----------------------------------------------------------------------------------
SVM
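Only the plotting lines survive in this fragment; a minimal sketch that builds the X and y they plot, using make_blobs and an SVC with a linear kernel (both choices are assumptions):

import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.svm import SVC

# two separable blobs as toy data
X, y = make_blobs(n_samples=100, centers=2, random_state=0)

# fit a linear-kernel support vector classifier
clf = SVC(kernel='linear')
clf.fit(X, y)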
# Scatter plot
plt.scatter(X[:, 0], X[:, 1],
c=y,
s=20, edgecolors="k")
plt.show()
---------------------------------------------------------------------------
SVM------2
# importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# plotting scatter (X, Y and the fitted clf come from lines lost from this
# fragment; a linear-kernel SVC as in the previous section would fit here)
plt.scatter(X[:, 0], X[:, 1], c=Y, s=50, cmap='spring')
plt.xlim(-1, 3.5)
plt.show()
print(X, Y)
clf.predict([[120, 990]])
clf.predict([[85, 550]])
--------------------------------------------------------------------------
---------------------------------
Decision Tree Regression USING SKLEARN
# import required libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

# import dataset
# dataset = pd.read_csv('Data.csv')
# alternatively open up .csv file to read data
dataset = np.array(
    [['Asset Flip', 100, 1000],
     ['Text Based', 500, 3000],
     ['Visual Novel', 1500, 5000],
     ['2D Pixel Art', 3500, 8000],
     ['2D Vector Art', 5000, 6500],
     ['Strategy', 6000, 7000],
     ['First Person Shooter', 8000, 15000],
     ['Simulator', 9500, 20000],
     ['Racing', 12000, 21000],
     ['RPG', 14000, 25000],
     ['Sandbox', 15500, 27000],
     ['Open-World', 16500, 30000],
     ['MMOFPS', 25000, 52000],
     ['MMORPG', 30000, 80000]
     ])

# select production cost as the feature X and profit as the target y
# (reconstructed: these lines were lost but are required below)
X = dataset[:, 1:2].astype(int)
y = dataset[:, 2].astype(int)

# print X
print(X)
# print y
print(y)

# fit the regressor and plot predictions over a fine grid (reconstructed)
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X, y)
X_grid = np.arange(min(X), max(X), 0.01).reshape(-1, 1)
plt.scatter(X, y, color='red')
plt.plot(X_grid, regressor.predict(X_grid), color='blue')

# specify title
plt.title('Profit to Production Cost (Decision Tree Regression)')
plt.xlabel('Production Cost')
plt.ylabel('Profit')
plt.show()

# import export_graphviz to export the trained tree to a .dot file
from sklearn.tree import export_graphviz
export_graphviz(regressor, out_file='tree.dot',
                feature_names=['Production Cost'])
------------------------------------------------------------------------------
K-MEANS CLUSTERING
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
X,y = make_blobs(n_samples = 500,n_features = 2,centers = 3,random_state = 23)
fig = plt.figure(0)
plt.grid(True)
plt.scatter(X[:,0],X[:,1])
plt.show()
k = 3
clusters = {}
np.random.seed(23)

# random initialization of the k cluster centers (reconstructed: only the
# last line of this loop survived in the fragment)
for idx in range(k):
    center = 2 * (2 * np.random.random((X.shape[1],)) - 1)
    cluster = {'center': center, 'points': []}
    clusters[idx] = cluster
print(clusters)
plt.scatter(X[:,0],X[:,1])
plt.grid(True)
for i in clusters:
    center = clusters[i]['center']
    plt.scatter(center[0], center[1], marker='*', c='red')
plt.show()
def distance(p1, p2):
    return np.sqrt(np.sum((p1 - p2) ** 2))

# Implementing E step: assign each point to its nearest center
def assign_clusters(X, clusters):
    for idx in range(X.shape[0]):
        dist = []
        curr_x = X[idx]
        for i in range(k):
            dis = distance(curr_x, clusters[i]['center'])
            dist.append(dis)
        curr_cluster = np.argmin(dist)
        clusters[curr_cluster]['points'].append(curr_x)
    return clusters

# Implementing M step: move each center to the mean of its points
# (reconstructed: only two lines of this function survived)
def update_clusters(X, clusters):
    for i in range(k):
        points = np.array(clusters[i]['points'])
        if points.shape[0] > 0:
            clusters[i]['center'] = points.mean(axis=0)
        clusters[i]['points'] = []
    return clusters

# predict the nearest cluster for every point (reconstructed)
def pred_cluster(X, clusters):
    pred = []
    for i in range(X.shape[0]):
        dist = []
        for j in range(k):
            dist.append(distance(X[i], clusters[j]['center']))
        pred.append(np.argmin(dist))
    return pred

clusters = assign_clusters(X, clusters)
clusters = update_clusters(X, clusters)
pred = pred_cluster(X, clusters)
plt.scatter(X[:,0],X[:,1],c = pred)
for i in clusters:
    center = clusters[i]['center']
    plt.scatter(center[0], center[1], marker='^', c='red')
plt.show()
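One E step followed by one M step is rarely enough; the two usually alternate until the centers stop moving. A hedged sketch of that loop over the functions above (the iteration cap and tolerance are example values):

# repeat E and M steps until the centers stabilize
for _ in range(100):
    old_centers = [clusters[i]['center'].copy() for i in range(k)]
    clusters = assign_clusters(X, clusters)
    clusters = update_clusters(X, clusters)
    shift = sum(distance(old_centers[i], clusters[i]['center']) for i in range(k))
    if shift < 1e-6:
        break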
-----------------------------------
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
X, y = load_iris(return_X_y=True)

# elbow method: sum of squared errors (inertia) for k = 1..10
# (reconstructed: the loop computing sse was lost from this fragment;
# random_state values are examples)
sse = []
for k in range(1, 11):
    km = KMeans(n_clusters=k, random_state=2)
    km.fit(X)
    sse.append(km.inertia_)
sns.set_style("whitegrid")
g = sns.lineplot(x=range(1, 11), y=sse)
plt.show()

# fit the final model with three clusters and get per-sample labels
kmeans = KMeans(n_clusters=3, random_state=2)
pred = kmeans.fit_predict(X)
print(pred)
plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
plt.scatter(X[:,0],X[:,1],c = pred, cmap=cm.Accent)
plt.grid(True)
for center in kmeans.cluster_centers_:
    center = center[:2]
    plt.scatter(center[0], center[1], marker='^', c='red')
plt.xlabel("petal length (cm)")
plt.ylabel("petal width (cm)")
plt.subplot(1,2,2)
plt.scatter(X[:,2],X[:,3],c = pred, cmap=cm.Accent)
plt.grid(True)
for center in kmeans.cluster_centers_:
    center = center[2:4]
    plt.scatter(center[0], center[1], marker='^', c='red')
plt.xlabel("sepal length (cm)")
plt.ylabel("sepal width (cm)")
plt.show()
-------------------------------------------------------------------
Customer Segmentation using Unsupervised Machine Learning in Python
by k mean clustering
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
df = pd.read_csv('new.csv')
df.head()
df.shape
df.info()
df.describe().T
df = df.dropna()
print("Total missing values are:", len(df))
df.nunique()
# split column names by dtype (reconstructed: the loop building these
# lists was lost from the fragment)
objects, floats = [], []
for col in df.columns:
    if df[col].dtype == object:
        objects.append(col)
    elif df[col].dtype == float:
        floats.append(col)
print(objects)
print(floats)
plt.subplots(figsize=(15, 10))
for i, col in enumerate(objects):
    plt.subplot(2, 2, i + 1)
    sb.countplot(df[col])
plt.show()
df['Marital_Status'].value_counts()
scaler = StandardScaler()
data = scaler.fit_transform(df)
error = []
for n_clusters in range(1, 21):
    model = KMeans(init='k-means++',
                   n_clusters=n_clusters,
                   max_iter=500,
                   random_state=22)
    model.fit(df)
    error.append(model.inertia_)
plt.figure(figsize=(10, 5))
sb.lineplot(x=range(1, 21), y=error)
sb.scatterplot(x=range(1, 21), y=error)
plt.show()
# project to 2-D with t-SNE and colour by cluster label (reconstructed:
# the lines producing tsne_data and segments were lost; five clusters
# is an example choice)
model = KMeans(init='k-means++', n_clusters=5, max_iter=500, random_state=22)
segments = model.fit_predict(df)
tsne = TSNE(n_components=2, random_state=22)
tsne_data = tsne.fit_transform(df)
plt.figure(figsize=(7, 7))
sb.scatterplot(x=tsne_data[:, 0], y=tsne_data[:, 1], hue=segments)
plt.show()
-------------------------------------------------------------
# importing dependencies
import numpy as np
import matplotlib.pyplot as plt
# creating data
mean = np.array([5.0, 6.0])
cov = np.array([[1.0, 0.95], [0.95, 1.2]])
data = np.random.multivariate_normal(mean, cov, 8000)
# visualising data
plt.scatter(data[:500, 0], data[:500, 1], marker='.')
plt.show()
# train-test-split
data = np.hstack((np.ones((data.shape[0], 1)), data))
split_factor = 0.90
split = int(split_factor * data.shape[0])
print(& quot
Number of examples in training set= % d & quot
% (X_train.shape[0]))
print(& quot
Number of examples in testing set= % d & quot
% (X_test.shape[0]))
------------------------------------------
import pandas as pd
import numpy as np
# Creating a DataFrame (example data; the original lines producing `data`
# were lost from this fragment)
data = np.random.rand(100, 2)
df = pd.DataFrame(data, columns=['Feature1', 'Feature2'])
# Save to CSV
df.to_csv('synthetic_data.csv', index=False)
import numpy as np

# naive k-means loop (reconstructed setup: k, max_iters and the initial
# centroids were lost; the values here are examples)
k = 3
max_iters = 100
centroids = data[np.random.choice(len(data), k, replace=False)]
for _ in range(max_iters):
    # Assign each data point to the closest centroid
    distances = np.linalg.norm(data - centroids[:, np.newaxis], axis=2)
    labels = np.argmin(distances, axis=0)
    # Update centroids as the mean of the points in each cluster
    new_centroids = np.array([data[labels == j].mean(axis=0) for j in range(k)])
    centroids = new_centroids
import numpy as np
import matplotlib.pyplot as plt

# plot points coloured by cluster, with centroids marked (reconstructed:
# the original plotting lines were lost)
plt.scatter(data[:, 0], data[:, 1], c=labels)
plt.scatter(centroids[:, 0], centroids[:, 1], marker='^', c='red')
plt.title('K-Means Clustering')
plt.xlabel('Feature1')
plt.ylabel('Feature2')
plt.show()
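Choosing k is commonly done with the elbow method: run k-means for a range of k and look for the bend in the inertia curve. A hedged sketch over the same data (the helper name and values are illustrative):

def choose_k(data, k_max=10):
    # within-cluster sum of squares for each candidate k
    inertias = []
    for k in range(1, k_max + 1):
        centroids = data[np.random.choice(len(data), k, replace=False)]
        for _ in range(50):
            distances = np.linalg.norm(data - centroids[:, np.newaxis], axis=2)
            labels = np.argmin(distances, axis=0)
            centroids = np.array([data[labels == j].mean(axis=0)
                                  if np.any(labels == j) else centroids[j]
                                  for j in range(k)])
        distances = np.linalg.norm(data - centroids[:, np.newaxis], axis=2)
        inertias.append((distances.min(axis=0) ** 2).sum())
    return inertias

plt.plot(range(1, 11), choose_k(data))
plt.xlabel('k')
plt.ylabel('inertia')
plt.show()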
-----------------------------------------------------------------------------------
Week-1: Write a python program to import and export the data using pandas library
1. Manual Function
import pandas as pd

def load_csv(filepath):
    data = []
    col = []
    checkcol = False
    with open(filepath) as f:
        for val in f.readlines():
            val = val.replace("\n", "")
            val = val.split(',')
            if checkcol is False:
                col = val
                checkcol = True
            else:
                data.append(val)
    df = pd.DataFrame(data=data, columns=col)
    return df
2. Numpy.loadtxt function
import numpy as np
df = np.loadtxt('convertcsv.csv', delimiter=',')
print(df[:5,:])
3. Numpy.genfromtxt()
data = np.genfromtxt('100 Sales Records.csv', delimiter=',')
>>> pd.DataFrame(data)
4. Pandas.read_csv()
>>> pdDf = pd.read_csv('100 Sales Record.csv')
>>> pdDf.head()
5. Pickle
import pickle
with open('test.pkl', 'wb') as f:
    pickle.dump(pdDf, f)
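Reading the pickled DataFrame back is the mirror image; a short sketch:

import pickle

# load the DataFrame back from the pickle file written above
with open('test.pkl', 'rb') as f:
    restored = pickle.load(f)
print(restored.head())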
-----------------------------------------------------------------------------------
WEEK-2: Data preprocessing
# filtering data
# displaying data only with Gender = Not NaN (reconstructed: the lines
# loading `data` and building bool_series were lost from this fragment)
import pandas as pd
data = pd.read_csv("employees.csv")
bool_series = pd.notnull(data["Gender"])
data[bool_series]
# importing pandas as pd
import pandas as pd
# importing numpy as np
import numpy as np
# dictionary of lists
dict = {'First Score':[100, 90, np.nan, 95],
'Second Score': [30, 45, 56, np.nan],
'Third Score':[np.nan, 40, 80, 98]}
# creating a dataframe from dictionary
df = pd.DataFrame(dict)
# filling missing value using fillna()
df.fillna(0)
# importing pandas as pd
import pandas as pd
# importing numpy as np
import numpy as np
# dictionary of lists
dict = {'First Score': [100, 90, np.nan, 95],
        'Second Score': [30, 45, 56, np.nan],
        'Third Score': [np.nan, 40, 80, 98]}
# creating a dataframe from dictionary
df = pd.DataFrame(dict)
# dropping rows with at least one missing value (reconstructed: the lines
# after the dictionary were lost; dropna is the usual counterpart to the
# fillna example above)
df.dropna()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
#import the breast _cancer dataset
from sklearn.datasets import load_breast_cancer
data=load_breast_cancer()
data.keys()
# Check the output classes
print(data['target_names'])
# Check the input attributes
print(data['feature_names'])
# construct a dataframe using pandas
df1=pd.DataFrame(data['data'],columns=data['feature_names'])
# Scale data before applying PCA
scaling=StandardScaler()
# Use fit and transform method
scaling.fit(df1)
Scaled_data=scaling.transform(df1)
# Set the n_components=3
principal=PCA(n_components=3)
principal.fit(Scaled_data)
x=principal.transform(Scaled_data)
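To check how much variance the three components retain, and to view the projection, a hedged continuation of the code above:

# proportion of variance captured by each of the 3 components
print(principal.explained_variance_ratio_)

# scatter of the first two principal components, coloured by class
plt.figure(figsize=(8, 6))
plt.scatter(x[:, 0], x[:, 1], c=data['target'], cmap='plasma')
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.show()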
-----------------------------------------------------------------
Week-4: Write a python program to demonstrate various data visualisation techniques
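The fragment below calls calculateClassProbabilities, whose definition did not survive; a hedged reconstruction of it and of the Gaussian density helper it relies on, in the style of the surrounding code:

import math

# Gaussian probability density for one attribute value
def calculateGaussianProbability(x, mean, stdev):
    expo = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * expo

# multiply per-attribute densities per class; `info` maps each class to a
# list of (mean, std_dev) pairs for its attributes
def calculateClassProbabilities(info, test):
    probabilities = {}
    for classValue, classSummaries in info.items():
        probabilities[classValue] = 1
        for i in range(len(classSummaries)):
            mean, std_dev = classSummaries[i]
            x = test[i]
            probabilities[classValue] *= calculateGaussianProbability(x, mean, std_dev)
    return probabilities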
def predict(info, test):
    probabilities = calculateClassProbabilities(info, test)
    bestLabel, bestProb = None, -1
    for classValue, probability in probabilities.items():
        if bestLabel is None or probability > bestProb:
            bestProb = probability
            bestLabel = classValue
    return bestLabel

def getPredictions(info, test):
    predictions = []
    for i in range(len(test)):
        result = predict(info, test[i])
        predictions.append(result)
    return predictions

def accuracy_rate(test, predictions):
    correct = 0
    for i in range(len(test)):
        if test[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(test))) * 100.0
import csv
filename = r'E:\user\MACHINE LEARNING\machine learning algos\Naive bayes\filedata.csv'
mydata = csv.reader(open(filename, "rt"))
mydata = list(mydata)
mydata = encode_class(mydata)
for i in range(len(mydata)):
    mydata[i] = [float(x) for x in mydata[i]]
ratio = 0.7
----------------------------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
def estimate_coef(x, y):
    # number of observations/points
    n = np.size(x)
    # means of x and y
    m_x = np.mean(x)
    m_y = np.mean(y)
    # cross-deviation and deviation about x
    SS_xy = np.sum(y * x) - n * m_y * m_x
    SS_xx = np.sum(x * x) - n * m_x * m_x
    # regression coefficients
    b_1 = SS_xy / SS_xx
    b_0 = m_y - b_1 * m_x
    return (b_0, b_1)

def plot_regression_line(x, y, b):
    # plotting the actual points as scatter plot
    plt.scatter(x, y, color="m", marker="o", s=30)
    # predicted response vector
    y_pred = b[0] + b[1] * x
    # plotting the regression line
    plt.plot(x, y_pred, color="g")
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()

def main():
    x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
    b = estimate_coef(x, y)
    print("Estimated coefficients:\nb_0 = {}\nb_1 = {}".format(b[0], b[1]))
    plot_regression_line(x, y, b)

if __name__ == "__main__":
    main()
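In the notation of estimate_coef, the coefficients are the usual closed-form least-squares estimates:

$$b_1 = \frac{SS_{xy}}{SS_{xx}} = \frac{\sum_i x_i y_i - n\,\bar{x}\,\bar{y}}{\sum_i x_i^2 - n\,\bar{x}^2}, \qquad b_0 = \bar{y} - b_1\,\bar{x}$$

so b_1 is the slope and b_0 the intercept of the fitted line y = b_0 + b_1 x.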
-----------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

class LogitRegression():
    def __init__(self, learning_rate, iterations):
        self.learning_rate = learning_rate
        self.iterations = iterations

    def fit(self, X, Y):
        self.m, self.n = X.shape
        self.W = np.zeros(self.n)
        self.b = 0
        self.X = X
        self.Y = Y
        for i in range(self.iterations):
            self.update_weights()
        return self

    def update_weights(self):
        # sigmoid of the current linear model
        A = 1 / (1 + np.exp(-(self.X.dot(self.W) + self.b)))
        tmp = (A - self.Y.T)
        tmp = np.reshape(tmp, self.m)
        # gradients of the loss w.r.t. weights and bias
        dW = np.dot(self.X.T, tmp) / self.m
        db = np.sum(tmp) / self.m
        self.W = self.W - self.learning_rate * dW
        self.b = self.b - self.learning_rate * db
        return self

    def predict(self, X):
        Z = 1 / (1 + np.exp(-(X.dot(self.W) + self.b)))
        Y = np.where(Z > 0.5, 1, 0)
        return Y
def main():
    df = pd.read_csv("diabetes.csv")
    X = df.iloc[:, :-1].values
    Y = df.iloc[:, -1:].values
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=1/3, random_state=0)
    # our from-scratch model
    model = LogitRegression(learning_rate=0.01, iterations=1000)
    model.fit(X_train, Y_train)
    # sklearn's reference implementation
    model1 = LogisticRegression()
    model1.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    Y_pred1 = model1.predict(X_test)
    correctly_classified = 0
    correctly_classified1 = 0
    count = 0
    for count in range(np.size(Y_pred)):
        if Y_test[count] == Y_pred[count]:
            correctly_classified = correctly_classified + 1
        if Y_test[count] == Y_pred1[count]:
            correctly_classified1 = correctly_classified1 + 1
        count = count + 1
    print("Accuracy on test set by our model : ", (
        correctly_classified / count) * 100)
    print("Accuracy on test set by sklearn model : ", (
        correctly_classified1 / count) * 100)

if __name__ == "__main__":
    main()
-----------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
def importdata():
    balance_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/'
        'balance-scale/balance-scale.data',
        sep=',', header=None)
    print("Dataset Length: ", len(balance_data))
    print("Dataset Shape: ", balance_data.shape)
    print("Dataset: ", balance_data.head())
return balance_data
def splitdataset(balance_data):
    X = balance_data.values[:, 1:5]
    Y = balance_data.values[:, 0]
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=100)
    return X, Y, X_train, X_test, y_train, y_test
def train_using_gini(X_train, X_test, y_train):
    clf_gini = DecisionTreeClassifier(criterion="gini", random_state=100,
                                      max_depth=3, min_samples_leaf=5)
    clf_gini.fit(X_train, y_train)
    return clf_gini

def train_using_entropy(X_train, X_test, y_train):
    clf_entropy = DecisionTreeClassifier(criterion="entropy", random_state=100,
                                         max_depth=3, min_samples_leaf=5)
    clf_entropy.fit(X_train, y_train)
    return clf_entropy
def prediction(X_test, clf_object):
    y_pred = clf_object.predict(X_test)
    print("Predicted values:")
    print(y_pred)
    return y_pred

def cal_accuracy(y_test, y_pred):
    print("Confusion Matrix: ", confusion_matrix(y_test, y_pred))
    print("Accuracy : ", accuracy_score(y_test, y_pred) * 100)
    print("Report : ", classification_report(y_test, y_pred))
def main():
    data = importdata()
    X, Y, X_train, X_test, y_train, y_test = splitdataset(data)
    clf_gini = train_using_gini(X_train, X_test, y_train)
    clf_entropy = train_using_entropy(X_train, X_test, y_train)
    # Operational Phase
    print("Results Using Gini Index:")
    y_pred_gini = prediction(X_test, clf_gini)
    cal_accuracy(y_test, y_pred_gini)
    print("Results Using Entropy:")
    y_pred_entropy = prediction(X_test, clf_entropy)
    cal_accuracy(y_test, y_pred_entropy)

if __name__ == "__main__":
    main()
-----------------------------------------------------------------------------------
WEEK-8
Implementation of Naïve Bayes classifier algorithm
import math
import random
import csv
def encode_class(mydata):
    # map each class label to an integer index
    classes = []
    for i in range(len(mydata)):
        if mydata[i][-1] not in classes:
            classes.append(mydata[i][-1])
    for i in range(len(classes)):
        for j in range(len(mydata)):
            if mydata[j][-1] == classes[i]:
                mydata[j][-1] = i
    return mydata

def splitting(mydata, ratio):
    # random train/test split in the given ratio
    train_num = int(len(mydata) * ratio)
    train = []
    test = list(mydata)
    while len(train) < train_num:
        index = random.randrange(len(test))
        train.append(test.pop(index))
    return train, test

def groupUnderClass(mydata):
    # group rows by their class label
    dict = {}
    for i in range(len(mydata)):
        if (mydata[i][-1] not in dict):
            dict[mydata[i][-1]] = []
        dict[mydata[i][-1]].append(mydata[i])
    return dict
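The listing stops after the grouping step; the classifier normally proceeds by summarizing each attribute of each class with its mean and standard deviation. A hedged sketch of that next step, in the same style:

# mean and standard deviation of a list of attribute values
def MeanAndStdDev(numbers):
    avg = sum(numbers) / float(len(numbers))
    variance = sum((x - avg) ** 2 for x in numbers) / float(len(numbers) - 1)
    return avg, math.sqrt(variance)

# per-class, per-attribute (mean, std_dev) summaries, dropping the label column
def MeanAndStdDevForClass(mydata):
    info = {}
    grouped = groupUnderClass(mydata)
    for classValue, instances in grouped.items():
        info[classValue] = [MeanAndStdDev(attribute)
                            for attribute in zip(*instances)][:-1]
    return info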
-----------------------------------------------------------------------------------
Week-9: Implementation of K-nearest Neighbor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
import numpy as np
import matplotlib.pyplot as plt
# load the iris data (reconstructed: the lines defining irisData and X were lost)
irisData = load_iris()
X = irisData.data
y = irisData.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)
neighbors = np.arange(1, 9)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))
# fit one classifier per k and record train/test accuracy
for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_accuracy[i] = knn.score(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)
plt.plot(neighbors, test_accuracy, label = 'Testing dataset Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training dataset Accuracy')
plt.legend()
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()
-----------------------------------------------------------------------------------
Let's first understand the term neural network. In a neural network, each neuron receives inputs, computes a weighted sum over them, passes that sum through an activation function, and sends the result on as input to the neurons in the next layer.
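A minimal numpy sketch of that single-neuron step (the weights, bias, and the sigmoid choice are illustrative, not from the original):

import numpy as np

def sigmoid(z):
    # squashes the weighted sum into (0, 1)
    return 1 / (1 + np.exp(-z))

x = np.array([0.5, 0.1, 0.9])   # inputs to the neuron
w = np.array([0.4, 0.7, 0.2])   # one weight per input
b = 0.1                         # bias term

output = sigmoid(np.dot(w, x) + b)  # weighted sum, then activation
print(output)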
-----------------------------------------------------------------------------------
WEEK-11
Implementing Random Forest
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
data = pd.read_csv('Salaries.csv')
print(data)
# select position level as x and salary as y (reconstructed: these lines
# were lost from the fragment but are required by regressor.fit below)
x = data.iloc[:, 1:2].values
y = data.iloc[:, 2].values
# Fitting Random Forest Regression to the dataset
# import the regressor
from sklearn.ensemble import RandomForestRegressor
# create regressor object
regressor = RandomForestRegressor(n_estimators = 100, random_state = 0)
# fit the regressor with x and y data
regressor.fit(x, y)
# test the output by changing values
Y_pred = regressor.predict(np.array([6.5]).reshape(1, 1))
# Visualising the Random Forest Regression results
# np.arange creates a range of values from the min of x to the max of x
# with a step of 0.01 between two consecutive values
X_grid = np.arange(min(x), max(x), 0.01)
# reshape for reshaping the data into a len(X_grid)*1 array,
# i.e. to make a column out of the X_grid value
X_grid = X_grid.reshape((len(X_grid), 1))
# Scatter plot for original data
plt.scatter(x, y, color = 'blue')
# plot predicted data
plt.plot(X_grid, regressor.predict(X_grid),color = 'green')
plt.title('Random Forest Regression')
plt.xlabel('Position level')
plt.ylabel('Salary')
plt.show()
-----------------------------------------------------------------------------------
WEEK-12: Unsupervised Learning
Implementing K-means Clustering
import sys
from random import shuffle

def ReadData(fileName):
    # Read the file, splitting by lines
    f = open(fileName, 'r')
    lines = f.read().splitlines()
    f.close()
    items = []
    for i in range(1, len(lines)):
        line = lines[i].split(',')
        itemFeatures = []
        for j in range(len(line) - 1):
            # Convert feature value to float
            v = float(line[j])
            # Add feature value to the feature list
            itemFeatures.append(v)
        items.append(itemFeatures)
    shuffle(items)
    return items

def FindColMinMax(items):
    n = len(items[0])
    minima = [sys.maxsize for i in range(n)]
    maxima = [-sys.maxsize - 1 for i in range(n)]
    for item in items:
        for f in range(len(item)):
            if item[f] < minima[f]:
                minima[f] = item[f]
            if item[f] > maxima[f]:
                maxima[f] = item[f]
    return minima, maxima
-----------------------------------------------------------------------------------
Hypothesis: ID3 decision tree
import math
import csv
def load_csv(filename):
    lines = csv.reader(open(filename, "r"))
    dataset = list(lines)
    headers = dataset.pop(0)
    return dataset, headers
class Node:
    def __init__(self, attribute):
        self.attribute = attribute
        self.children = []
        self.answer = ""
def subtables(data, col, delete):
    dic = {}
    coldata = [row[col] for row in data]
    attr = list(set(coldata))
    counts = [0] * len(attr)
    r = len(data)
    c = len(data[0])
    for x in range(len(attr)):
        for y in range(r):
            if data[y][col] == attr[x]:
                counts[x] += 1
    for x in range(len(attr)):
        dic[attr[x]] = [[0 for i in range(c)] for j in range(counts[x])]
        pos = 0
        for y in range(r):
            if data[y][col] == attr[x]:
                if delete:
                    del data[y][col]
                dic[attr[x]][pos] = data[y]
                pos += 1
    return attr, dic
def entropy(S):
    attr = list(set(S))
    if len(attr) == 1:
        return 0
    counts = [0, 0]
    for i in range(2):
        counts[i] = sum([1 for x in S if attr[i] == x]) / (len(S) * 1.0)
    sums = 0
    for cnt in counts:
        sums += -1 * cnt * math.log(cnt, 2)
    return sums
def compute_gain(data, col):
    attr, dic = subtables(data, col, delete=False)
    total_size = len(data)
    entropies = [0] * len(attr)
    ratio = [0] * len(attr)
    total_entropy = entropy([row[-1] for row in data])
    for x in range(len(attr)):
        ratio[x] = len(dic[attr[x]]) / (total_size * 1.0)
        entropies[x] = entropy([row[-1] for row in dic[attr[x]]])
        total_entropy -= ratio[x] * entropies[x]
    return total_entropy
def build_tree(data, features):
    lastcol = [row[-1] for row in data]
    if (len(set(lastcol))) == 1:
        node = Node("")
        node.answer = lastcol[0]
        return node
    n = len(data[0]) - 1
    gains = [0] * n
    for col in range(n):
        gains[col] = compute_gain(data, col)
    split = gains.index(max(gains))
    node = Node(features[split])
    fea = features[:split] + features[split + 1:]
    attr, dic = subtables(data, split, delete=True)
    for x in range(len(attr)):
        child = build_tree(dic[attr[x]], fea)
        node.children.append((attr[x], child))
    return node

def print_tree(node, level):
    if node.answer != "":
        print(" " * level, node.answer)
        return
    print(" " * level, node.attribute)
    for value, n in node.children:
        print(" " * (level + 1), value)
        print_tree(n, level + 2)

def classify(node, x_test, features):
    if node.answer != "":
        print(node.answer)
        return
    pos = features.index(node.attribute)
    for value, n in node.children:
        if x_test[pos] == value:
            classify(n, x_test, features)
'''Main program'''
dataset, features = load_csv("data3.csv")
node1 = build_tree(dataset, features)
print("The decision tree for the dataset using ID3 algorithm is")
print_tree(node1, 0)
testdata, features = load_csv("data3_test.csv")
for xtest in testdata:
    print("The test instance:", xtest)
    print("The label for test instance:", end=" ")
    classify(node1, xtest, features)
-----------------------------------------------------------------------
import pandas as pd
msg=pd.read_csv('naivetext.csv',names=['message','label'])
print('The dimensions of the dataset',msg.shape)
msg['labelnum']=msg.label.map({'pos':1,'neg':0})
X=msg.message
y=msg.labelnum
print(X)
print(y)
#splitting the dataset into train and test data
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(X,y)
print ('\n The total number of Training Data :',ytrain.shape)
print ('\n The total number of Test Data :',ytest.shape)
#output of count vectoriser is a sparse matrix
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm=count_vect.transform(xtest)
print('\n The words or Tokens in the text documents \n')
print(count_vect.get_feature_names())
df = pd.DataFrame(xtrain_dtm.toarray(),
                  columns=count_vect.get_feature_names())
# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(xtrain_dtm,ytrain)
predicted = clf.predict(xtest_dtm)
#printing accuracy, Confusion matrix, Precision and Recall
from sklearn import metrics
print('\n Accuracy of the classifier is',
      metrics.accuracy_score(ytest, predicted))
print('\n Confusion matrix')
print(metrics.confusion_matrix(ytest, predicted))
print('\n The value of Precision',
      metrics.precision_score(ytest, predicted))
print('\n The value of Recall',
      metrics.recall_score(ytest, predicted))
-----------------------------------------------------------------------------------
10. Apply EM algorithm to cluster a set of data stored in a .CSV file. Use the same data set for clustering using the k-Means algorithm. Compare the results of these two algorithms and comment on the quality of clustering. You can add Python ML library classes/API in the program.
SOURCE CODE:
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
import sklearn.metrics as sm
import pandas as pd
import numpy as np

iris = datasets.load_iris()
X = pd.DataFrame(iris.data)
X.columns = ['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width']
y = pd.DataFrame(iris.target)
y.columns = ['Targets']

model = KMeans(n_clusters=3)
model.fit(X)

plt.figure(figsize=(14, 7))
colormap = np.array(['red', 'lime', 'black'])

# Plot the Original Classifications
plt.subplot(1, 2, 1)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y.Targets], s=40)
plt.title('Real Classification')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

# Plot the Models Classifications
plt.subplot(1, 2, 2)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[model.labels_], s=40)
plt.title('K Mean Classification')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

print('The accuracy score of K-Mean: ', sm.accuracy_score(y, model.labels_))
print('The Confusion matrix of K-Mean: ', sm.confusion_matrix(y, model.labels_))

from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
scaler.fit(X)
xsa = scaler.transform(X)
xs = pd.DataFrame(xsa, columns=X.columns)
# xs.sample(5)

from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3)
gmm.fit(xs)
y_gmm = gmm.predict(xs)  # y_cluster_gmm

plt.subplot(2, 2, 3)
plt.scatter(X.Petal_Length, X.Petal_Width, c=colormap[y_gmm], s=40)
plt.title('GMM Classification')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')

print('The accuracy score of EM: ', sm.accuracy_score(y, y_gmm))
print('The Confusion matrix of EM: ', sm.confusion_matrix(y, y_gmm))
-----------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import datasets
iris=datasets.load_iris()
x = iris.data
y = iris.target
print ('sepal-length', 'sepal-width', 'petal-length', 'petal-width')
print(x)
print('class: 0-Iris-Setosa, 1- Iris-Versicolour, 2- Iris-Virginica')
print(y)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
# train the model with K=5 nearest neighbours
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(x_train, y_train)
#to make predictions on our test data
y_pred=classifier.predict(x_test)
print('Confusion Matrix')
print(confusion_matrix(y_test,y_pred))
print('Accuracy Metrics')
print(classification_report(y_test,y_pred))
--------------------------------------------------------------
import numpy as np
import pandas as pd
from numpy import genfromtxt
from sklearn import linear_model
from sklearn.model_selection import train_test_split,cross_val_score
from matplotlib import pyplot as plt
from matplotlib import colors
df=pd.read_csv("data_for_regression_coefficient.csv")
df.head(10)
x=df['x'].values.reshape(df['x'].count(),1)
y=df['y'].values.reshape(df['y'].count(),1)
print(x.shape)
print(y.shape)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33,
                                                    random_state=9892)
# fit a linear model on the training split (reconstructed: the lines after
# `linear_model` were lost from this fragment)
lm = linear_model.LinearRegression()
lm.fit(X_train, y_train)
print("Coefficient:", lm.coef_)
print("Intercept:", lm.intercept_)
-------------------------
Question: find the line of regression and estimate the height of the son when the height of the father is 164.
data = pd.read_csv('heights_of_father_and_son.csv')
# assuming the father's height is the predictor x and the son's height the
# response y (the extraction lines were lost from this fragment)
x = data.iloc[:, 0].values.reshape(-1, 1)
y = data.iloc[:, 1].values.reshape(-1, 1)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33,
                                                    random_state=9892)
# plugging the fitted coefficients into the regression line y = b1*x + b0
y = 0.61023622 * 164 + 66.11417323  # ≈ 166.19
------------------------------------------------------------------------
pca
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
# Apply PCA (assumes `data` is an (n_samples, n_features) array)
pca = PCA(n_components=2)
pca_result = pca.fit_transform(data)
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Assuming you have a dataset X, replace the following line with your data
X = np.random.rand(100, 3)  # 100 samples, 3 features (example data)
# standardize before PCA (reconstructed: this line was lost)
X_std = StandardScaler().fit_transform(X)
plt.subplot(1, 2, 1)
plt.scatter(X_std[:, 0], X_std[:, 1])
plt.title('Original Data')
plt.xlabel('Standardized Feature 1')
plt.ylabel('Standardized Feature 2')
#plt.subplot(1, 2, 2)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.datasets import fetch_openml
import pandas as pd
# Load the MNIST dataset (you can replace this with your own image dataset)
mnist = fetch_openml('mnist_784')
images = mnist.data.astype(float)
labels = mnist.target.astype(int)
# each 28x28 image is already flattened to 784 values (reconstructed:
# the line defining flattened_images was lost)
flattened_images = np.array(images)
# Convert to DataFrame
df = pd.DataFrame(flattened_images)
# Apply PCA
n_components = 50 # You can adjust this number based on your requirements
pca = PCA(n_components=n_components)
flattened_images_pca = pca.fit_transform(df.values)
# Visualize original and PCA-transformed images
fig, axes = plt.subplots(2, 10, figsize=(10, 2))
for i in range(10):
    axes[0, i].imshow(flattened_images[i].reshape(28, 28), cmap='gray')
    axes[0, i].axis('off')
    axes[1, i].imshow(pca.inverse_transform(
        flattened_images_pca[i]).reshape(28, 28), cmap='gray')
    axes[1, i].axis('off')
plt.show()
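How much pixel variance the 50 components retain can be checked directly; a one-line hedged follow-up:

# fraction of the original pixel variance retained by the 50 components
print("Retained variance:", pca.explained_variance_ratio_.sum())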