0% found this document useful (0 votes)
10 views

ML Lab Programs

Uploaded by

lakshmiv91163
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
10 views

ML Lab Programs

Uploaded by

lakshmiv91163
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 15

#ML Lab programs

#lab_1_single_instance
import csv
import numpy as np
# Lab 1a - Find-S: learn the maximally specific hypothesis from the training
# file, then label a single instance typed in by the user.
# Each CSV row holds the attribute values followed by the class label in the
# last column.
with open('training-data-1.csv', 'r', newline='') as file:
    x = np.array([row for row in csv.reader(file)])
r, c = x.shape
print("The no of rows & cols are:", r, "and", c)
print("The dataset is:\n", x)

# Start from the most specific hypothesis (phi for every attribute).
s = np.full(c - 1, "\u03A6", dtype=object)
print("Initial specific hypothesis:\n", s)

seeded = False
for i in range(r):
    # Find-S ignores negative examples.  The label is normalised so that
    # "No", "no " and "NO" are all treated as negative.
    if str(x[i, c - 1]).strip().upper() == 'NO':
        continue
    if not seeded:
        # The first *positive* example seeds the hypothesis; seeding from row 0
        # unconditionally would be wrong whenever row 0 is a negative example.
        for j in range(c - 1):
            s[j] = str(x[i, j])
        seeded = True
        print("Hypothesis after processing the training data:\n", s)
        continue
    # Generalise every attribute that disagrees with this positive example.
    for j in range(c - 1):
        if x[i, j] != s[j]:
            s[j] = '?'
    print("The Intermediate Hypothesis:\n", s)
print("The Final MAximal Specific ypothesis:\n", s)

print("ENter new instance value")
new = np.empty(c - 1, dtype=object)
print("Enter Attribute value for sky,airtemp,humidity,wing,water,and forecast")
for i in range(c - 1):
    # Surrounding whitespace would otherwise make an equal value compare unequal.
    new[i] = input("Enter attribute value").strip()
print("\nThe given instance is:", new)

# The instance is positive only if every non-'?' attribute of s matches it.
flag = 1
for i in range(c - 1):
    if s[i] != '?' and s[i] != new[i]:
        flag = 0
        break
print("\nThe assigned label for Enjoying Water Sort is")
if flag == 1:
    print("Yes means person enjoys water sport")
else:
    print("No means person will not enjoys water sport")

#lab_1b_Multiple_Instance
import csv
import numpy as np
# Lab 1b - Find-S: learn the maximally specific hypothesis, then label every
# instance found in a second CSV file and write the labelled rows out.
with open('training-data-1.csv', 'r', newline='') as file:
    x = np.array([row for row in csv.reader(file)])
r, c = x.shape
print("The no of rows & cols are:", r, "and", c)
print("The dataset is:\n", x)

# Start from the most specific hypothesis (phi for every attribute).
s = np.full(c - 1, "\u03A6", dtype=object)
print("Initial specific hypothesis:\n", s)

seeded = False
for i in range(r):
    # Find-S ignores negative examples; the label is normalised so that
    # "No", "no " and "NO" are all treated as negative.
    if str(x[i, c - 1]).strip().upper() == 'NO':
        continue
    if not seeded:
        # Seed from the first positive example rather than blindly from row 0.
        for j in range(c - 1):
            s[j] = str(x[i, j])
        seeded = True
        print("Hypothesis after processing the training data:\n", s)
        continue
    for j in range(c - 1):
        if x[i, j] != s[j]:
            s[j] = '?'
    print("The Intermediate Hypothesis:\n", s)
print("The Final MAximal Specific ypothesis:\n", s)

print("Read the set of instances from csv file to assign labels:")
with open('new-training-data.csv', 'r', newline='') as file1:
    new1 = [row for row in csv.reader(file1)]
new = np.array(new1)
# The size must come from the new samples themselves.  Taking it from the
# training array (as before) skipped rows or raised IndexError whenever the
# two files had a different number of rows.
r1 = len(new1)
c1 = len(new1[0]) if new1 else 0
print("the sixe of new sample is ", r1, "rows and", c1, "cols")
print("New data", new)
print("\n\n")

n_attributes = len(s)
for row in new1:
    if len(row) < n_attributes:
        raise SystemExit(
            "Each new sample needs at least %d attribute values" % n_attributes)
    # Only the first n_attributes columns are compared, so the file may or may
    # not carry an extra trailing column.
    consistent = all(h == '?' or h == v for h, v in zip(s, row))
    row.append("Yes" if consistent else "No")
print("The assigned labels for set of instances read from csv file are:")
print("\n", new1)

with open('data_1b.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(new1)

#lab_2
import csv
import numpy as np

# Lab 2 - Candidate elimination: maintain a specific boundary `s` and a general
# boundary `g` (one hypothesis per row) over the training data, then label one
# instance typed in by the user.
# The file name previously carried a trailing space ('finds.csv '), which makes
# open() look for a different, normally non-existent, file.
with open('finds.csv', 'r', newline='') as file:
    x = np.array([row for row in csv.reader(file)])
r, c = x.shape
n_attributes = c - 1
print("The number of rows and columns are: ", r, "and", c - 1)
print("The daatset is: \n", x)

s = np.full(n_attributes, "\u03A6", dtype=object)
print("\n Most specific hypotheis: \n", s)

# g is kept 2-D from the start (a single all-'?' row) so that row-wise pruning
# and stacking always work; the old 1-D start made g[i1, j1] fail.
g = np.full((1, n_attributes), "?", dtype=object)
print("\n Most general hypotheis: \n", g[0])

# NOTE(review): as in the original, row 0 is assumed to be a positive example
# and seeds the specific hypothesis - confirm against the data file.
for j in range(n_attributes):
    s[j] = str(x[0, j])


def _covers(hypothesis, instance):
    """Return True when every non-'?' attribute of hypothesis equals instance."""
    return all(h == '?' or h == v for h, v in zip(hypothesis, instance))


for i in range(1, r):
    example = x[i, :n_attributes]
    if x[i, c - 1] == 'Yes':
        # Positive example: minimally generalise s ...
        for j in range(n_attributes):
            if example[j] != s[j]:
                s[j] = '?'
        print("\nIntermediate specific hypothesis: \n", s)
        # ... and drop every general hypothesis that rejects it.  A boolean
        # mask replaces np.delete() inside an index loop, which shifted the
        # remaining rows and could skip rows or run past the end.
        keep = np.array([_covers(row, example) for row in g], dtype=bool)
        g = g[keep]
    else:
        # Negative example: add one minimal specialisation per attribute on
        # which s disagrees with the example.
        additions = []
        for j in range(n_attributes):
            if s[j] != '?' and example[j] != s[j]:
                g1 = np.full(n_attributes, "?", dtype=object)
                g1[j] = s[j]
                additions.append(g1)
        if additions:
            g = np.vstack([g] + additions)
            # The all-'?' placeholder is superseded once real specialisations
            # exist; only such rows are removed, never a learned hypothesis.
            informative = np.array(
                [any(v != '?' for v in row) for row in g], dtype=bool)
            g = g[informative]
        print("\nIntermediate general hypothesis\n", g)

print("\nSpecific hypotheis: \n", s)
print("\nGeneral hypotheis: \n", g)

print("Enter new instance value")
new = np.empty(n_attributes, dtype=object)
print("ENTER ATTRIBUTE VALUES for SKY,AIRTEMP,HUMIDITY,WIND,WATER, and FORECAST")
for i in range(n_attributes):
    new[i] = input("enter attribute values")
print("\nthe given instance is", new)

# flag: instance matches the specific boundary.
flag = 1 if _covers(s, new) else 0
# flag1: instance matches at least one general hypothesis.  All attributes are
# compared (the old range(nc-1) skipped the last one) and the flag is defined
# even when g is empty.
flag1 = 1 if any(_covers(row, new) for row in g) else 0

print("\n The assigned label for ENJOYING WATER SPORT is")
if flag == 1 or flag1 == 1:
    print("YES")
else:
    print("NO")

#lab_3_naive_bayesian
import csv
# Lab 3, part 1: tally attribute-value / label co-occurrences in playtennis.txt
# and print the resulting conditional probabilities.
print('CONSTUCTION OF NAIVE BAYESIAN MODEL (Calculation of Probalities)')

# (column index, attribute value) pairs that are counted.
tracked = (
    (1, 'Sunny'), (1, 'Rainy'), (1, 'Overcast'),
    (2, 'Hot'), (2, 'Cool'), (2, 'Mild'),
    (3, 'High'), (3, 'Normal'),
    (4, 'Weak'), (4, 'Strong'),
)
tally = {}
for column, value in tracked:
    tally[(column, value, "yes")] = 0
    tally[(column, value, "no")] = 0

line_count = 0
yes_count = 0
with open('playtennis.txt') as csv_file:
    for row in csv.reader(csv_file, delimiter=','):
        line_count += 1
        label = row[5]
        if label == "yes":
            yes_count += 1
        for column in (1, 2, 3, 4):
            key = (column, row[column], label)
            # Only the tracked values with a "yes"/"no" label are counted.
            if key in tally:
                tally[key] += 1

# Every row that is not labelled "yes" counts towards the "no" total.
no_count = line_count - yes_count
p_playtennis_yes = yes_count / line_count
p_playtennis_no = no_count / line_count

# All probabilities are computed before anything is printed.
prob = {}
for column, value in tracked:
    prob[(column, value, "yes")] = tally[(column, value, "yes")] / yes_count
    prob[(column, value, "no")] = tally[(column, value, "no")] / no_count

print()
print("probability of Play_Tennis with label YES:", p_playtennis_yes)
print("probability of Play_Tennis with label NO:", p_playtennis_no)

# Report layout: heading, then (message, tally key) pairs in print order.
report = (
    ('Probalitiies for OUTLOOK attribute', (
        ("probability of outlook is sunny with label yes:", (1, 'Sunny', "yes")),
        ("probability of outlook is sunny with label NO", (1, 'Sunny', "no")),
        ("probability of outlook is Rainy with label yes:", (1, 'Rainy', "yes")),
        ("probability of outlook is Rainy with label NO", (1, 'Rainy', "no")),
        ("probability of outlook is overcast with label yes:", (1, 'Overcast', "yes")),
        ("probability of outlook is overcast with label NO", (1, 'Overcast', "no")),
    )),
    ('Probalitiies for Temperature attribute', (
        ("probability of Temperature is hot with label yes:", (2, 'Hot', "yes")),
        ("probability of Temperature is hot with label NO", (2, 'Hot', "no")),
        ("probability of Temperature is cool with label yes:", (2, 'Cool', "yes")),
        ("probability of Temperature is cool with label NO", (2, 'Cool', "no")),
        ("probability of Temperature is mild with label yes:", (2, 'Mild', "yes")),
        ("probability of Temperature is mild with label NO", (2, 'Mild', "no")),
    )),
    ('Probalitiies for HUMIDITY attribute', (
        ("probability of Humidity is high with label yes:", (3, 'High', "yes")),
        ("probability of Humidity is high with label NO", (3, 'High', "no")),
        ("probability of Humidity is normal with label yes:", (3, 'Normal', "yes")),
        ("probability of Humidity is normal with label NO", (3, 'Normal', "no")),
    )),
    ('Probalitiies for WIND attribute', (
        ("probability of WIND is weak with label yes:", (4, 'Weak', "yes")),
        ("probability of WIND is weak with label NO", (4, 'Weak', "no")),
        ("probability of WIND is strong with label yes:", (4, 'Strong', "yes")),
        ("probability of WIND is strong with label NO", (4, 'Strong', "no")),
    )),
)
for heading, entries in report:
    print()
    print(heading)
    for message, key in entries:
        print(message, prob[key])
print()
# Lab 3, part 2: classify one user-supplied record with the naive Bayes rule,
# counting matches directly from playtennis.txt.
print('Assigning label based Naive BAyesian classifier')
# The prompt is a single string literal; the scraped source had it (and the two
# result f-strings below) broken across lines, which is a syntax error.
rec = input("Enter OUTLOOK(x1), TEMPERATURE(x2), HUMIDITY(x3), WIND(x4) in this sequence separated by comma ")
# Strip each token so that "Sunny, Hot" matches the stored value "Hot".
reclist = [value.strip() for value in rec.split(",")]
if len(reclist) != 4:
    raise SystemExit(
        "Expected exactly 4 comma-separated values, got %d" % len(reclist))

line_count = 0
yes_count = 0
# hits[label][k]: rows with that label whose column k+1 equals reclist[k].
hits = {"yes": [0, 0, 0, 0], "no": [0, 0, 0, 0]}
with open('playtennis.txt', newline='') as csv_file:
    for row in csv.reader(csv_file, delimiter=','):
        line_count += 1
        label = row[5]
        if label == "yes":
            yes_count += 1
        if label in hits:
            for k, value in enumerate(reclist):
                if row[k + 1] == value:
                    hits[label][k] += 1

if line_count == 0:
    raise SystemExit("playtennis.txt contains no training rows")
# Every row that is not labelled "yes" counts towards the "no" total.
no_count = line_count - yes_count


def _posterior_score(matches, class_count):
    """Return prior * product of likelihoods; 0.0 when the class never occurs."""
    if class_count == 0:
        # Avoids the ZeroDivisionError raised when one label is absent.
        return 0.0
    score = class_count / line_count
    for matched in matches:
        score *= matched / class_count
    return score


yes_x = _posterior_score(hits["yes"], yes_count)
no_x = _posterior_score(hits["no"], no_count)
if yes_x > no_x:
    print(f'The playing tennis {reclist[0]}, {reclist[1]}, {reclist[2]}, {reclist[3]} is allowed means assigned label is YES')
else:
    print(f'The playing tennis {reclist[0]}, {reclist[1]}, {reclist[2]}, {reclist[3]} is not possible means assigned label is NO')

#lab_4_naive_bayesian_sk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
# Lab 4 - Gaussian naive Bayes with scikit-learn on a 70/30 split.
# NOTE(review): GaussianNB needs numeric features; presumably play_tennis.csv
# is already label-encoded - confirm before running.
data = pd.read_csv("play_tennis.csv")
print("The given data set is:")
print(data)

# Columns 0-3 are the predictors, column 4 is the target.
predictors = data.iloc[:, 0:4]
target = data.iloc[:, 4]
predictors_train, predictors_test, target_train, target_test = train_test_split(
    predictors, target, test_size=0.3, random_state=123)

gnb = GaussianNB()
model = gnb.fit(predictors_train, target_train)
prediction = model.predict(predictors_test)

print("ACCURACY of Classifier")
# The score was previously computed and discarded, so a script run printed the
# heading with no value (a bare expression is only echoed in a notebook).
print(accuracy_score(target_test, prediction, normalize=True))

#lab_5_dtc
import pandas as pd
import numpy as np
from pprint import pprint
# Load the PlayTennis records used to build the decision tree.  The column
# names are supplied explicitly; the last one, 'PlayTennis', is the target.
dataset = pd.read_csv('play_tennisDTC1.csv',
names=['Outlook','Temperature','humidity','Wind','PlayTennis'])
#Construction of decision tree
def entropy(target_col):
    """Return the base-2 Shannon entropy of the values in ``target_col``.

    The entropy is computed from the relative frequency of each distinct
    value; a column holding a single distinct value has entropy 0.
    """
    _, frequencies = np.unique(target_col, return_counts=True)
    share = frequencies / np.sum(frequencies)
    return np.sum(-share * np.log2(share))

def InfoGain(data,split_attribute_name,target_name="PlayTennis"):
    """Return the information gain of splitting ``data`` on one attribute.

    The gain is the entropy of the target column minus the size-weighted
    entropy of the target within each distinct value of the split attribute.
    The attribute name, its values, their counts and the gain are printed as a
    side effect.
    """
    baseline = entropy(data[target_name])

    levels, sizes = np.unique(data[split_attribute_name], return_counts=True)
    n_rows = np.sum(sizes)

    weighted_parts = []
    for level, size in zip(levels, sizes):
        # Rows outside the level become all-NaN and are dropped again.
        subset = data.where(data[split_attribute_name] == level).dropna()
        weighted_parts.append((size / n_rows) * entropy(subset[target_name]))
    remainder = np.sum(weighted_parts)

    gain = baseline - remainder
    print(split_attribute_name, levels, sizes, gain)
    return gain
def ID3(data, originaldata, features, target_attribute_name="PlayTennis",
        parent_node_class=None):
    """Build a decision tree with the ID3 algorithm.

    Parameters:
        data: the rows that reach the current node.
        originaldata: the full training set; its majority label is the
            fallback for an empty partition.
        features: names of the attributes still available for splitting.
        target_attribute_name: name of the label column.
        parent_node_class: majority label of the parent node, returned when no
            features are left.

    Returns:
        A nested dict ``{feature: {value: subtree_or_label}}``, or a bare label
        when the node is a leaf.
    """
    # An empty partition must be handled first: np.unique(...)[0] on an empty
    # column raises IndexError, which made this branch unreachable before.
    if len(data) == 0:
        if len(originaldata) == 0:
            return parent_node_class
        labels, counts = np.unique(
            originaldata[target_attribute_name], return_counts=True)
        return labels[np.argmax(counts)]

    labels, counts = np.unique(data[target_attribute_name], return_counts=True)

    # Pure node: every remaining row carries the same label.
    if len(labels) <= 1:
        return labels[0]

    # Nothing left to split on: fall back to the parent's majority label.
    if len(features) == 0:
        return parent_node_class

    # Majority label of this node, handed down as the children's fallback.
    parent_node_class = labels[np.argmax(counts)]

    # Split on the feature with the largest information gain.
    item_values = [InfoGain(data, feature, target_attribute_name)
                   for feature in features]
    best_feature = features[np.argmax(item_values)]

    tree = {best_feature: {}}
    features = [name for name in features if name != best_feature]

    # Grow one branch per value of the chosen feature.
    for value in np.unique(data[best_feature]):
        sub_data = data.where(data[best_feature] == value).dropna()
        # Pass originaldata through instead of the module-level `dataset`, so
        # the function no longer depends on a global or leaks rows that were
        # not part of the training set into the fallback label.
        subtree = ID3(sub_data, originaldata, features,
                      target_attribute_name, parent_node_class)
        print("\n ")
        tree[best_feature][value] = subtree

    return tree
def predict(query, tree, default=1):
    """Return the label the decision tree assigns to ``query``.

    Parameters:
        query: dict mapping feature name to the instance's value.
        tree: nested dict ``{feature: {value: subtree_or_label}}``.
        default: value returned when the tree has no branch for the query.

    Returns:
        The leaf label reached by following the query's values, or ``default``
        when a value is missing from the tree or no query key appears in it.
    """
    for key in query:
        if key not in tree:
            continue
        try:
            result = tree[key][query[key]]
        except (KeyError, TypeError):
            # Value unseen during training (or not usable as a key).
            return default
        if isinstance(result, dict):
            # Propagate the caller's default; it previously reset to 1 in
            # every recursive call.
            return predict(query, result, default)
        return result
    # None of the query's features occurs at this node.
    return default
def train_test_split(dataset, n_train=14):
    """Split ``dataset`` positionally into a training and a testing frame.

    Parameters:
        dataset: the DataFrame to split.
        n_train: number of leading rows that go to the training frame
            (default 14, the previously hard-coded value).

    Returns:
        ``(training_data, testing_data)``; both have their index reset to start
        from 0 so later positional look-ups by label do not fail.
    """
    # NOTE(review): this name shadows sklearn's train_test_split, which other
    # labs in this file import - confirm the labs are run separately.
    training_data = dataset.iloc[:n_train].reset_index(drop=True)
    testing_data = dataset.iloc[n_train:].reset_index(drop=True)
    return training_data, testing_data
# Split the loaded records into the frames used for building and evaluating
# the tree (positional split, no shuffling).
training_data,testing_data = train_test_split(dataset)
def test(data,tree):
    """Print the percentage of rows in ``data`` that ``tree`` labels correctly.

    Every row, minus its last (target) column, is turned into a query dict and
    classified with ``predict``; the predictions are compared with the
    'PlayTennis' column of ``data``.
    """
    queries = data.iloc[:, :-1].to_dict(orient="records")

    predicted = pd.DataFrame(columns=["predicted"])
    for row_no, query in enumerate(queries):
        predicted.loc[row_no, "predicted"] = predict(query, tree, 1.0)

    n_correct = np.sum(predicted["predicted"] == data["PlayTennis"])
    print('\n The prediction accuracy is: ', (n_correct / len(data)) * 100, '%')
# Build the tree from the training frame; every column except the last is
# offered as a candidate feature.
tree = ID3(training_data,training_data,training_data.columns[:-1])
print("The final Resultant Decision Tree")
pprint(tree)
# Report the accuracy on the held-out rows.
test(testing_data,tree)
# Assign labels to new samples read from a second file with the same columns.

dataset1 = pd.read_csv('newsamples.csv',
names=['Outlook','Temperature','humidity','Wind','PlayTennis'])
print()
print()
def assign(data1,tree):
    """Print the label ``tree`` predicts for every row of ``data1``.

    The last column of ``data1`` is dropped before each row is turned into a
    query dict; the queries and the resulting predictions are both printed.
    """
    samples = data1.iloc[:, :-1].to_dict(orient="records")
    print('the given samples to assign a label are')
    print(samples)
    print()

    predicted = pd.DataFrame(columns=["predicted"])
    print('predicted labels for each new sample are')
    for row_no, sample in enumerate(samples):
        predicted.loc[row_no, "predicted"] = predict(sample, tree, 1.0)
    print(predicted)
# Label the new samples with the tree built above.
assign(dataset1,tree)

#lab_7_knn
# K-Nearest neighbor implementation

import math

# calculate the Euclidean distance between two vectors


def euclidean_distance(row1, row2):
    """Return the Euclidean distance between the feature parts of two rows.

    The number of features is derived from the rows themselves instead of the
    module-level ``n1``, which is only defined later from user input:

    * rows of different length (an unlabelled query against a labelled
      training row): the length of the shorter row is used;
    * rows of equal length: both are taken to end with a class label, so the
      last element is excluded.

    Both cases give the same result as the former ``range(n1 - 1)`` loop when
    ``n1`` is the length of a labelled row.
    """
    if len(row1) == len(row2):
        n_features = len(row1) - 1
    else:
        n_features = min(len(row1), len(row2))
    distance = 0.0
    for i in range(n_features):
        distance += (row1[i] - row2[i]) ** 2
    return math.sqrt(distance)

# Locate the most similar neighbors


def get_neighbors(train, test_row, num_neighbors):
    """Return the ``num_neighbors`` training rows closest to ``test_row``.

    All (row, distance) pairs are printed before sorting.  When fewer training
    rows exist than requested, all of them are returned; indexing past the end
    previously raised IndexError in that case.
    """
    distances = [(train_row, euclidean_distance(test_row, train_row))
                 for train_row in train]
    print('distances from neighbors to given data point')
    print(distances)
    distances.sort(key=lambda pair: pair[1])
    # A non-positive request yields no neighbours, as the old range() loop did.
    wanted = max(num_neighbors, 0)
    return [row for row, _ in distances[:wanted]]

# Make a classification prediction with neighbors


def KNN_classification(train, test_row, num_neighbors):
    """Return the most frequent class label among the nearest neighbours.

    The neighbours found by ``get_neighbors`` are printed; the label is the
    last element of each neighbour row.
    """
    nearest = get_neighbors(train, test_row, num_neighbors)
    print()
    print('Neighbors of given data point are\n')
    print(nearest)
    labels = []
    for row in nearest:
        labels.append(row[-1])
    return max(set(labels), key=labels.count)

# Interactive driver: read the labelled points, one query point and k from the
# keyboard, then print the label assigned by k-nearest-neighbour voting.
dataset = []
n = int(input('enter no of data points'))
n1 = int(input('enter no of dimensions in a dataset along with class label attribute'))
print('attr1,aatr2,.....attrn(class label)')
for i in range(n):
    print('enter attribute values for point', i)
    # n1 values per point; the last one is the class label.
    dataset.append([float(input('enter value ')) for _ in range(n1)])
print('Given points in data set are')
print(dataset)

print('Enter a new data point to assign a label')
# The query point has no class label, hence n1 - 1 values.
p = [float(input('enter data point elements')) for _ in range(n1 - 1)]

print('Enter K no.of neighbors')
k = int(input())
prediction = KNN_classification(dataset, p, k)
print()
print('The assigned label for the given data point is is', prediction)

#lab_8a_knn
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
# Lab 8a - k-nearest-neighbour classification of the iris data (k = 5) on a
# 70/30 split; the split is not seeded.
data = pd.read_csv('iris_dataset_ann.csv')

# Columns 0-3 are used as features.
predictors = data.iloc[:, 0:4]
print('Features of training dataset\n', predictors)
print('\n')

# Column 4 is used as the class.
target = data.iloc[:, 4]
print('Class of IRIS flower \n', target)
print('\n')

split = train_test_split(predictors, target, test_size=0.3, random_state=None)
predictors_train, predictors_test, target_train, target_test = split

knn = KNeighborsClassifier(n_neighbors=5)
model = knn.fit(predictors_train, target_train)
accuracy = knn.score(predictors_test, target_test)
print('Accuracy of KNN classifier on IRIS data set', accuracy)

#lab_8b_svm
#SVM Classifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm
# Lab 8b - linear-kernel SVM classification of the iris data on a 70/30 split.
data = pd.read_csv('iris_dataset_ann.csv')

print('Features supplied for classification')
predictors = data.iloc[:, 0:4]
print('\n', predictors)

print('Target class labels')
target = data.iloc[:, 4]
print('\n', target)

predictors_train, predictors_test, target_train, target_test = train_test_split(
    predictors, target, test_size=0.3, random_state=123)

# The estimator gets its own name.  Binding it to `svm` replaced the imported
# sklearn module, so any later use of svm.SVC in the same session failed.
classifier = svm.SVC(kernel='linear')
model = classifier.fit(predictors_train, target_train)
prediction = classifier.predict(predictors_test)
print('Accuracy of SVM classifier',
      accuracy_score(target_test, prediction, normalize=True))

#lab_9_Kmeans_clustering
import math
import sys

# Lab 9 - K-means clustering on points typed in by the user.  The first c
# points are the initial centroids; iteration stops when the cluster
# membership matrix no longer changes.


def _distance_matrix(points, centroids):
    """Return Euclidean distances: one row per point, one column per centroid."""
    return [
        [math.sqrt(sum((a - b) * (a - b) for a, b in zip(point, centre)))
         for centre in centroids]
        for point in points
    ]


def _assign_and_update(points, dist, centroids):
    """Assign each point to its nearest centroid and recompute the centroids.

    Returns ``(membership, new_centroids)``.  ``membership[i][j]`` is ``j + 1``
    when point i belongs to cluster j and 0 otherwise.
    """
    n_clusters = len(centroids)
    n_dims = len(centroids[0])
    membership = [[0] * n_clusters for _ in points]
    sums = [[0] * n_dims for _ in range(n_clusters)]
    sizes = [0] * n_clusters
    for i, row in enumerate(dist):
        # A point tied between centroids joins the lowest-numbered one only;
        # it used to be counted in every tied cluster.
        j = row.index(min(row))
        membership[i][j] = j + 1
        for k in range(n_dims):
            sums[j][k] += points[i][k]
        sizes[j] += 1
    new_centroids = []
    for j in range(n_clusters):
        if sizes[j]:
            new_centroids.append([total / sizes[j] for total in sums[j]])
        else:
            # An empty cluster keeps its centroid instead of dividing by zero.
            new_centroids.append(list(centroids[j]))
    return membership, new_centroids


m = int(input("no.of data points: "))
n = int(input("no.of dimensions per a data point: "))

# Read the data points (whitespace-separated integers, n per point).
l = []
for i in range(m):
    k = input("enter a data point with " + str(n) + " dimensions: ").split()
    k = [int(value) for value in k]
    if len(k) == n:
        l.append(k)
    else:
        print("Data point with incorrect dimensions...!")
        sys.exit()
print('\n\nThe given data points are', l)

c = int(input("enter no.of clusters: "))
# The first c points seed the centroids, so c must not exceed the point count.
if c < 1 or c > m:
    print("Number of clusters must be between 1 and the number of data points...!")
    sys.exit()

g = [list(l[i]) for i in range(c)]
print("\nInitialized centeriod matrix:")
print(g)

# First iteration: distances, memberships and updated centroids.
dist = [[0] * c for i in range(m)]
print('\nintial distance matrix', dist)

dist = _distance_matrix(l, g)
print("\nDistance matrix after 1st iteration: ")
print(dist)

grp, g = _assign_and_update(l, dist, g)
print("\nCluster Membership matrix after 1st iteration: ")
print(grp)
print("\nupdated centeriod matrix after 1st iteration:")
print(g)

# Further iterations until the membership matrix is stable.
grp1 = [[0] * c for i in range(m)]
while grp1 != grp:
    dist = _distance_matrix(l, g)
    print("\nDistance matrix in next iteration: ")
    print(dist)

    grp1 = grp
    grp, g = _assign_and_update(l, dist, g)
    print("\nCluster Membership after next iteration: ")
    print(grp)
    print("\n\n\nUpdated centroid matrix after next iteration:")
    print(g)

#lab_10_Linear_regression
#Linear Regression implementation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def estimate_coef(x, y):
    """Return the least-squares intercept and slope ``(b0, b1)`` of y on x.

    Parameters:
        x: predictor values (sequence, array or pandas Series).
        y: response values, same length as ``x``.

    Returns:
        Tuple ``(b0, b1)`` with ``b1 = cov(x, y) / var(x)`` and
        ``b0 = mean(y) - b1 * mean(x)``.  The two means are printed.

    Raises:
        ValueError: if the inputs are empty, differ in length, or all x values
            are equal (the slope is undefined).
    """
    # Plain arrays make the access positional; x[i] on a pandas Series is a
    # label look-up and fails when the index is not 0..n-1.
    xs = np.asarray(x, dtype=float)
    ys = np.asarray(y, dtype=float)
    if xs.size == 0 or xs.size != ys.size:
        raise ValueError("x and y must be non-empty and of equal length")

    mean_x = np.mean(xs)
    mean_y = np.mean(ys)
    print('\n mean of salary is', mean_x)
    print('\n mean of loan amount is', mean_y)
    print('\n')

    # Sums of cross-deviations and of squared deviations about the mean.
    dev_x = xs - mean_x
    dev_y = ys - mean_y
    cov = np.sum(dev_x * dev_y)
    var = np.sum(dev_x * dev_x)
    if var == 0:
        raise ValueError("all x values are identical; the slope is undefined")

    # Regression coefficients.
    b1 = cov / var
    b0 = mean_y - b1 * mean_x

    return (b0, b1)

def plot_regression_line(x, y, b):
    """Show a scatter plot of the data with the fitted regression line.

    Parameters:
        x: predictor values.
        y: observed response values.
        b: pair ``(b0, b1)`` holding intercept and slope.
    """
    intercept, slope = b[0], b[1]

    # Observed points.
    plt.scatter(x, y, color="m", marker="o", s=30)

    # Fitted line through the predicted responses.
    fitted = intercept + slope * x
    plt.plot(x, fitted, color="g")

    plt.xlabel('Salary in lakhs')
    plt.ylabel('Loan amount in Lakhs')

    plt.show()

#Import the dataset and define the features

# Lab 10 driver: fit loan amount against salary, plot the fitted line, then
# predict the loan amount for a salary entered by the user.
dataframe = pd.read_csv('reg-dataset.csv', names=['salary', 'loan_amount_sanctioned'])
print(dataframe)
print('\n')

sal = dataframe.iloc[:, 0]
print('The values extraced for SALARY column in lakhs from dataframe')
print(sal)
print('\n')

loanamount = dataframe.iloc[:, 1]
print('The values extraced for LOAN AMOUNT in Lakhs column from dataframe')
print(loanamount)
print('\n')

b = estimate_coef(sal, loanamount)
print("Estimated coefficients:\n b0 = {} \n b1 = {}".format(b[0], b[1]))
print('\n')

print('plot of a straight line')
plot_regression_line(sal, loanamount, b)
print('\n')

print('Enter the salary of a person to predict the loan amount to be sanctioned')
# Salaries are in lakhs and may be fractional (e.g. 4.5); int() rejected them.
salary1 = float(input())
predicted_loan_amount = b[0] + b[1] * salary1
print('loan amount predicted for the entered salary is by Linear Regression: ')
print(predicted_loan_amount)

You might also like