Pattern Recognition
LAB FILE
List of Experiments:
1. WAP to read a CSV file and an image file.
2. WAP for pre-processing of data (cleaning of data and removing data redundancy).
3. Draw scatter plots and histograms from a multivariate data set.
4. WAP to implement Bayes' theorem in Python.
5. Implement Naive Bayes on the Heart Disease data set.
6. WAP to find correlation and covariance between features of data.
7. WAP to implement PCA.
9. WAP to implement univariate feature selection for the Wine Quality data set.
10. Write a program to implement the k-Nearest Neighbour algorithm to classify the Iris data set.
11. Write a program to implement the Support Vector Machine algorithm to classify the Heart Disease data set.
12. Write a program to implement the Decision Tree algorithm to classify the Wine Quality data set.
13. Write a program to implement an Artificial Neural Network algorithm to classify the Iris data set.
import math

# Calculating mean
def mean(numbers):
    return sum(numbers) / float(len(numbers))

# Calculating standard deviation
def std_dev(numbers):
    avg = mean(numbers)
    variance = sum([(x - avg) ** 2 for x in numbers]) / float(len(numbers) - 1)
    return math.sqrt(variance)

# Summarise each attribute (column) of the data as (mean, std dev)
def MeanAndStdDev(mydata):
    info = [(mean(attribute), std_dev(attribute)) for attribute in zip(*mydata)]
    # e.g. mydata = [[a, b, c], [m, n, o], [x, y, z]]
    # mean of 1st attribute = (a + m + x) / 3, mean of 2nd attribute = (b + n + y) / 3
    # delete the summary of the last column, which holds the class label
    del info[-1]
    return info
# Accuracy score
def accuracy_rate(test, predictions):
    correct = 0
    for i in range(len(test)):
        if test[i][-1] == predictions[i]:
            correct += 1
    return (correct / float(len(test))) * 100.0
# driver code
# prepare the model (MeanAndStdDevForClass and getPredictions are part of the
# full Naive Bayes program and are not reproduced in this listing)
info = MeanAndStdDevForClass(train_data)
# test the model
predictions = getPredictions(info, test_data)
accuracy = accuracy_rate(test_data, predictions)
print("Accuracy of your model is: ", accuracy)
# visualization tools
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pp  # renamed ydata_profiling in newer releases
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

# list the input files available in the Kaggle environment
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# load the Heart Disease data set
df = pd.read_csv("../input/heart-disease-uci/heart.csv")
df.sample(5)
df.info()

# generate an automated profiling report of the data
report = pp.ProfileReport(df)
report.to_file("report.html")
report
Task 6: WAP to find correlation and covariance between features of data
Correlation between features of data
The Pearson correlation coefficient (named for Karl Pearson) can be used to summarize the strength of the linear relationship between two data samples.
Pearson's correlation coefficient is calculated as the covariance of the two variables divided by the product of the standard deviation of each data sample. It is the normalization of the covariance between the two variables, giving an interpretable score:
Pearson's correlation coefficient = covariance(X, Y) / (stdv(X) * stdv(Y))
We can calculate the correlation between the two variables in our test problem. The complete example is listed below.
# calculate the Pearson's correlation between two variables
from numpy.random import randn
from numpy.random import seed
from scipy.stats import pearsonr
# seed random number generator
seed(1)
# prepare data
data1 = 20 * randn(1000) + 100
data2 = data1 + (10 * randn(1000) + 50)
# calculate Pearson's correlation
corr, _ = pearsonr(data1, data2)
print("Pearson's correlation: %.3f" % corr)
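The task also asks for the covariance itself, which the listing above does not print. A short sketch, reusing the same synthetic data, computes it with NumPy's cov() function:

# calculate the covariance between two variables
from numpy import cov
from numpy.random import randn, seed
seed(1)
data1 = 20 * randn(1000) + 100
data2 = data1 + (10 * randn(1000) + 50)
# cov() returns the 2x2 covariance matrix; cov(X, Y) is the off-diagonal entry
covariance = cov(data1, data2)
print(covariance)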
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2

# read the image in grayscale so it forms a 2-D matrix
# ('image.jpg' is a placeholder for any local image file)
img = cv2.imread('image.jpg', cv2.IMREAD_GRAYSCALE)

# obtain the singular value decomposition of the image matrix
U, S, V = np.linalg.svd(img)
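The SVD can then be used for PCA-style dimensionality reduction: keeping only the top k singular values gives a low-rank approximation of the image. A minimal sketch (the choice k = 50 is illustrative):

# reconstruct the image from the top k singular values
k = 50
approx = U[:, :k] @ np.diag(S[:k]) @ V[:k, :]
plt.imshow(approx, cmap='gray')
plt.title('Rank-%d approximation' % k)
plt.show()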
Here we will predict the quality of wine on the basis of the given features. We use the Wine Quality dataset from Kaggle, which contains the fundamental features responsible for affecting the quality of the wine. Using several machine learning models, we will predict the quality of the wine. Here we deal only with the white wine type, and we use classification techniques to check the quality of the wine, i.e. whether it is good or bad.
In this dataset the classes are ordered but not balanced: red wine instances are present at a high rate, and white wine instances are fewer than red.
These are the names of the features in the dataset:
1. type
2. fixed acidity
3. volatile acidity
4. citric acid
5. residual sugar
6. chlorides
7. free sulfur dioxide
8. total sulfur dioxide
9. density
10. pH
11. sulphates
12. alcohol
13. quality
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
# loading the data (path to a local copy of winequalityN.csv)
Dataframe = pd.read_csv(r'D:\xdatasets\winequalityN.csv')
# show rows and columns
Dataframe.head()
# getting info.
Dataframe.info()
Dataframe.describe()
# null value check
Dataframe.isnull().sum()
# plot pairplot
sb.pairplot(Dataframe)
#show graph
plt.show()
# plot histograms of each feature
Dataframe.hist(bins=20, figsize=(10, 10))
plt.show()

# bar plot of alcohol content against quality
plt.figure(figsize=[15, 6])
plt.bar(Dataframe['quality'], Dataframe['alcohol'])
plt.xlabel('quality')
plt.ylabel('alcohol')
plt.show()
# correlation by visualization
plt.figure(figsize=[18, 7])
# plot the correlation heatmap
sb.heatmap(Dataframe.corr(), annot=True)
plt.show()

# collect columns that are highly correlated (|corr| > 0.7) with an earlier column
corr_matrix = Dataframe.corr()
colm = []
# loop over columns
for i in range(len(corr_matrix.keys())):
    # loop over the rows below the diagonal
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > 0.7:
            colm.append(corr_matrix.columns[i])
# drop the highly correlated column found above
new_df = Dataframe.drop('total sulfur dioxide', axis=1)
# fill missing values with the column means
new_df.update(new_df.fillna(new_df.mean()))
# select the categorical columns
cat = new_df.select_dtypes(include='O')
# create dummies of the categorical columns
df_dummies = pd.get_dummies(new_df, drop_first=True)
print(df_dummies)
# label wines with quality >= 7 as best quality (1), the rest as 0
df_dummies['best quality'] = [1 if x >= 7 else 0 for x in Dataframe.quality]
print(df_dummies)
# import libraries (train_test_split lives in sklearn.model_selection)
from sklearn.model_selection import train_test_split
# independent variables
x = df_dummies.drop(['quality','best quality'],axis=1)
# dependent variable
y = df_dummies['best quality']
# creating train test splits
xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2,random_state=40)
# import libraries
from sklearn.preprocessing import MinMaxScaler
# create the scaler
norm = MinMaxScaler()
# fit the scaler on the training data only
norm_fit = norm.fit(xtrain)
# transformation of the training data
scal_xtrain = norm_fit.transform(xtrain)
# transformation of the testing data
scal_xtest = norm_fit.transform(xtest)
print(scal_xtrain)
# import libraries
from sklearn.ensemble import RandomForestClassifier
# for error checking
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
# create the model
rnd = RandomForestClassifier()
# fit the model on the scaled training data
fit_rnd = rnd.fit(scal_xtrain, ytrain)
# checking the accuracy score
rnd_score = rnd.score(scal_xtest, ytest)
print('score of model is : ', rnd_score)
print('.................................')
print('calculating the error')
# predict on the scaled test data
x_predict = list(rnd.predict(scal_xtest))
# checking mean squared error
MSE = mean_squared_error(ytest, x_predict)
# checking root mean squared error
RMSE = np.sqrt(MSE)
print('mean squared error is : ', MSE)
print('root mean squared error is : ', RMSE)
print(classification_report(ytest, x_predict))
# compare the predictions with the original labels
results = {'predicted': x_predict, 'original': ytest}
pd.DataFrame(results).head(10)
# feature scaling; StandardScaler is assumed here since the original listing
# uses `scaler` without defining it (X_train/X_test come from a prior split)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# k-Nearest Neighbour classifier with k = 5
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# evaluate the classifier
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# error rates for different values of k (filled in by the sketch below)
error = []
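The dangling error list above suggests the usual search over k; a plausible sketch, assuming the train/test split used by the classifier above:

import numpy as np
import matplotlib.pyplot as plt
# try k = 1..39 and record the mean error on the test set
for k in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    pred_k = knn.predict(X_test)
    error.append(np.mean(pred_k != y_test))
plt.plot(range(1, 40), error, marker='o')
plt.xlabel('k')
plt.ylabel('mean error')
plt.show()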
# Decision Tree visualization (feature_cols and the X_train/y_train split are
# assumed to be defined earlier in the full program)
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from io import StringIO
from IPython.display import Image
import pydotplus

# Create Decision Tree classifier object and fit it before exporting
clf = DecisionTreeClassifier(criterion="entropy", max_depth=3)
clf = clf.fit(X_train, y_train)

dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True, feature_names=feature_cols, class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('diabetes.png')
Image(graph.create_png())
sns.lmplot(x='PetalLengthCm', y='PetalWidthCm',
           data=data,
           fit_reg=False,
           hue="Species",
           scatter_kws={"marker": "D", "s": 50})
plt.title('PetalLength vs PetalWidth')

sns.lmplot(x='SepalLengthCm', y='PetalLengthCm',
           data=data,
           fit_reg=False,
           hue="Species",
           scatter_kws={"marker": "D", "s": 50})
plt.title('SepalLength vs PetalLength')

sns.lmplot(x='SepalWidthCm', y='PetalWidthCm',
           data=data,
           fit_reg=False,
           hue="Species",
           scatter_kws={"marker": "D", "s": 50})
plt.title('SepalWidth vs PetalWidth')
plt.show()
print(data["Species"].unique())
data.loc[data["Species"]=="Iris-setosa","Species"]=0
data.loc[data["Species"]=="Iris-versicolor","Species"]=1
data.loc[data["Species"]=="Iris-virginica","Species"]=2
print(data.head())
data=data.iloc[np.random.permutation(len(data))]
print(data.head())
X=data.iloc[:,1:5].values
y=data.iloc[:,5].values
print("Shape of X",X.shape)
print("Shape of y",y.shape)
print("Examples of X\n",X[:3])
print("Examples of y\n",y[:3])
from sklearn.preprocessing import normalize
X_normalized = normalize(X, axis=0)
print("Examples of X_normalised\n", X_normalized[:3])
#Creating train,test and validation data
'''
80% -- train data
20% -- test data
'''
total_length=len(data)
train_length=int(0.8*total_length)
test_length=int(0.2*total_length)
X_train=X_normalized[:train_length]
X_test=X_normalized[train_length:]
y_train=y[:train_length]
y_test=y[train_length:]
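The network itself is not reproduced in this listing. A minimal dense-network sketch for the three iris classes, assuming Keras is available (the layer sizes and epoch count are illustrative, not the original choices):

from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical

# one-hot encode the integer class labels (0, 1, 2)
y_train_cat = to_categorical(y_train.astype(int), num_classes=3)

model = Sequential()
model.add(Dense(10, input_dim=4, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train_cat, epochs=100, batch_size=10, verbose=0)

# quantities used by the accuracy computation below
predict_label = np.argmax(model.predict(X_test), axis=1)
y_label = y_test.astype(int)
length = len(y_test)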
accuracy = np.sum(y_label == predict_label) / length * 100
print("Accuracy of the dataset", accuracy)
# number of clusters
K = 3
# initial centroids: K randomly sampled points (assumed initialisation;
# not shown in the original listing)
Centroids = X.sample(n=K)

diff = 1
j = 0
while diff != 0:
    XD = X
    i = 1
    # distance of every point from each centroid
    for index1, row_c in Centroids.iterrows():
        ED = []
        for index2, row_d in XD.iterrows():
            d1 = (row_c["ApplicantIncome"] - row_d["ApplicantIncome"]) ** 2
            d2 = (row_c["LoanAmount"] - row_d["LoanAmount"]) ** 2
            d = np.sqrt(d1 + d2)
            ED.append(d)
        X[i] = ED
        i = i + 1

    # assign each point to its nearest centroid
    C = []
    for index, row in X.iterrows():
        min_dist = row[1]
        pos = 1
        for i in range(K):
            if row[i + 1] < min_dist:
                min_dist = row[i + 1]
                pos = i + 1
        C.append(pos)
    X["Cluster"] = C

    # recompute the centroids as the mean of each cluster
    Centroids_new = X.groupby(["Cluster"]).mean()[["LoanAmount", "ApplicantIncome"]]
    if j == 0:
        diff = 1
        j = j + 1
    else:
        diff = ((Centroids_new['LoanAmount'] - Centroids['LoanAmount']).sum()
                + (Centroids_new['ApplicantIncome'] - Centroids['ApplicantIncome']).sum())
        print(diff.sum())
    Centroids = X.groupby(["Cluster"]).mean()[["LoanAmount", "ApplicantIncome"]]

# plot the clusters and the final centroids
color = ['blue', 'green', 'cyan']
for k in range(K):
    data = X[X["Cluster"] == k + 1]
    plt.scatter(data["ApplicantIncome"], data["LoanAmount"], c=color[k])
plt.scatter(Centroids["ApplicantIncome"], Centroids["LoanAmount"], c='red')
plt.xlabel('Income')
plt.ylabel('Loan Amount (In Thousands)')
plt.show()
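For comparison, scikit-learn's KMeans performs the same clustering in a few lines. A sketch, assuming X still holds the ApplicantIncome and LoanAmount columns:

from sklearn.cluster import KMeans
# cluster the two features into K = 3 groups
km = KMeans(n_clusters=3, random_state=0)
labels = km.fit_predict(X[["ApplicantIncome", "LoanAmount"]])
print(km.cluster_centers_)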
import networkx as nx

G = nx.Graph()
G.add_edges_from([('A', 'B'), ('A', 'K'), ('B', 'K'), ('A', 'C'),
                  ('B', 'C'), ('C', 'F'), ('F', 'G'), ('C', 'E'),
                  ('E', 'F'), ('E', 'D'), ('E', 'H'), ('H', 'I'), ('I', 'J')])

# returns a dictionary of shortest paths from A to all other nodes
print(nx.shortest_path(G, 'A'))
# returns a dictionary of shortest path lengths from A to all other nodes
print(dict(nx.shortest_path_length(G, 'A')))
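Giving both a source and a target returns a single path instead of a dictionary:

# shortest path and its length between two specific nodes
print(nx.shortest_path(G, 'A', 'J'))
print(nx.shortest_path_length(G, 'A', 'J'))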