Data Science Lab Experiments


Experiment 1

AIM: Write a program to demonstrate a) arrays and b) array indexing such as slicing, integer array indexing and Boolean array indexing, along with their basic operations in NumPy.
A) Simple indexing and slicing (Python lists)
list1 = [1, 2, 3, 4, 5, 6]
list2 = [10, 9, 8, 7, 6, 5]
print([list1]*3)
print(list1+list2)
print(list1[2:4])
print(list1[:4])
print(list2[-3:-1])
print(list2[-3:])
B) Element-wise operations (Python lists with zip)
list1 = [1, 2, 3, 4, 5, 6]
list2 = [10, 9, 8, 7, 6, 5]
result = [a * b for a, b in zip(list1, list2)]
print(result)
C) NumPy Boolean operation (equality)
import numpy as np
A = np.array([4, 7, 3, 4, 2, 8])
print(A == 4)
D) NumPy Boolean operation (comparison)
import numpy as np
A = np.array([4, 7, 3, 4, 2, 8])
print(A < 5)
E) Two-dimensional array
import numpy as np
B = np.array([[42,56,89,65],
[99, 88, 42, 12],
[55, 42, 17, 18]])
print(B >= 42)
F) Higher dimensions
import numpy as np
A = np.array([
[12, 13, 14, 12, 16, 14, 11, 10, 9],
[11, 14, 12, 15, 15, 16, 10, 12, 11],
[10, 12, 12, 15, 14, 16, 10, 12, 12],
[ 9, 11, 16, 15, 14, 16, 15, 12, 10],
[12, 11, 16, 14, 10, 12, 16, 12, 13],
[10, 15, 16, 14, 14, 14, 16, 15, 12],
[13, 17, 14, 10, 14, 11, 14, 15, 10],
[10, 16, 12, 14, 11, 12, 14, 18, 11],
[10, 19, 12, 14, 11, 12, 14, 18, 10],
[14, 22, 17, 19, 16, 17, 18, 17, 13],
[10, 16, 12, 14, 11, 12, 14, 18, 11],
[10, 16, 12, 14, 11, 12, 14, 18, 11],
[10, 19, 12, 14, 11, 12, 14, 18, 10],
[14, 22, 12, 14, 11, 12, 14, 17, 13],
[10, 16, 12, 14, 11, 12, 14, 18, 11]])
B = A < 15
print(B.astype(np.int8))
G) Basic Slicing in NumPy array
import numpy as np
# Create an array with elements from 0 to 19 using np.arange
a = np.arange(20)
print("\n Array is:\n ",a)
print("\n a[15]=",a[15])
# a[start:stop:step]
print("\n a[-8:17:1] = ",a[-8:17:1])
print("\n a[10:] = ",a[10:])
H) Basic slicing with ellipsis
import numpy as np
b = np.array([[[1, 2, 3],[4, 5, 6]], [[7, 8, 9],[10, 11, 12]]])
print(b[...,1])
I) Advanced Indexing
# Python program showing advanced indexing
import numpy as np
a = np.array([[1, 2], [3, 4], [5, 6]])
print(a[[0, 1, 2], [0, 0, 1]])
J) Advanced Indexing
import numpy as np
b = np.array([[5, 5],[4, 5],[16, 4]])
sumrow = b.sum(-1)
print(b[sumrow%10==0])
K) Boolean Indexing
import numpy as np
a = np.array([1, 2, 3])
b = np.array([True, True, False])
c = a[b]
print(c)
OUTPUT
A) [[1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6], [1, 2, 3, 4, 5, 6]]
[1, 2, 3, 4, 5, 6, 10, 9, 8, 7, 6, 5]
[3, 4]
[1, 2, 3, 4]
[7, 6]
B)[10, 18, 24, 28, 30, 30]
C)[ True False False True False False]
D)[ True False True True True False]
E)[[ True True True True]
[ True True True False]
[ True True False False]]
F)[[1 1 1 1 0 1 1 1 1]
[1 1 1 0 0 0 1 1 1]
[1 1 1 0 1 0 1 1 1]
[1 1 0 0 1 0 0 1 1]
[1 1 0 1 1 1 0 1 1]
[1 0 0 1 1 1 0 0 1]
[1 0 1 1 1 1 1 0 1]
[1 0 1 1 1 1 1 0 1]
[1 0 1 1 1 1 1 0 1]
[1 0 0 0 0 0 0 0 1]
[1 0 1 1 1 1 1 0 1]
[1 0 1 1 1 1 1 0 1]
[1 0 1 1 1 1 1 0 1]
[1 0 1 1 1 1 1 0 1]
[1 0 1 1 1 1 1 0 1]]
G)Array is:
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19]
a[15]= 15
a[-8:17:1] = [12 13 14 15 16]
a[10:] = [10 11 12 13 14 15 16 17 18 19]
H)[[ 2 5]
[ 8 11]]
I)[1 3 6]
J)[[ 5 5]
[16 4]]
K)[1 2]
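Sections A and B above operate on plain Python lists, where + concatenates and * repeats. With NumPy arrays the same operators act element-wise, which is the contrast the AIM's "basic operations" points at. A short illustrative sketch (an addition, not part of the original listing):

import numpy as np

a = np.array([1, 2, 3, 4, 5, 6])
b = np.array([10, 9, 8, 7, 6, 5])

print(a + b)   # element-wise sum: [11 11 11 11 11 11]
print(a * b)   # element-wise product, same values as the zip version in B
print(a[2:4])  # slicing works as with lists: [3 4]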
Experiment 2
AIM: Write a program to compute summary statistics such as mean, median, mode, standard deviation and variance of different types of given data.
A) Mean
numbers = [4, 10, 29, 33, 42, 67]
def find_mean(list_of_numbers):
    sum_n = sum(list_of_numbers)
    len_n = len(list_of_numbers)
    mean = sum_n / len_n
    return mean
result = find_mean(numbers)
print(result)
B) Median
numbers_even = [4, 10, 29, 33, 42, 67]      # 6 elements
numbers_odd = [4, 10, 29, 33, 42, 67, 99]   # 7 elements
def find_median(list_of_numbers):
    list_of_numbers.sort()
    length = len(list_of_numbers)
    length_is_odd = length % 2 != 0
    if length_is_odd:
        index = length // 2 + 1
        median = list_of_numbers[index - 1]
    else:
        index_1 = length // 2
        median = (list_of_numbers[index_1 - 1] + list_of_numbers[index_1]) / 2
    return median
print(find_median(numbers_odd))
print(find_median(numbers_even))
C) Mode
def calculate_mode(list_of_numbers):
    counter = {}
    for i in list_of_numbers:
        if i in counter:
            counter[i] += 1
        else:
            counter[i] = 1
    max_frequency = max(counter.values())
    modes = [key for key, value in counter.items() if value == max_frequency]
    return modes, max_frequency
n = [4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 9, 10, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
mode, frequency = calculate_mode(n)
print("Mode(s):", mode)
print("Frequency:", frequency)
OUTPUT
A) Mean: 30.833333333333332
B) Median (odd): 33
   Median (even): 31.0
C) Mode(s): [2]
   Frequency: 10
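The AIM also asks for standard deviation and variance, which the listings above do not cover. A minimal sketch in the same style, using the population formulas (the statistics cross-check is an assumed addition, not part of the original manual):

import statistics

numbers = [4, 10, 29, 33, 42, 67]

def find_variance(list_of_numbers):
    mean = sum(list_of_numbers) / len(list_of_numbers)
    squared_diffs = [(x - mean) ** 2 for x in list_of_numbers]
    return sum(squared_diffs) / len(list_of_numbers)  # population variance

variance = find_variance(numbers)
std_dev = variance ** 0.5  # standard deviation is the square root of variance
print("Variance:", variance)
print("Standard deviation:", std_dev)
# Cross-check against the standard library (population forms)
print(statistics.pvariance(numbers), statistics.pstdev(numbers))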
Experiment 3
AIM: Write a script named copyfile.py. The script should prompt the user for the names of two text files and copy the contents of the first file into the second file.
SOURCE CODE:
file1 = input("Enter First Filename: ")
file2 = input("Enter Second Filename: ")
# open file in read mode
fn1 = open(file1, 'r')
# open other file in write mode
fn2 = open(file2, 'w')
# read the content of the file line by line
cont = fn1.readlines()
for i in range(0, len(cont)):
    fn2.write(cont[i])
# close the file
fn2.close()
print("Content of first file copied to second file ")
# open file in read mode
fn2 = open(file2, 'r')
# read the content of the file
cont1 = fn2.read()
# print the content of the file
print("Content of Second file :")
print(cont1)
# close all files
fn1.close()
fn2.close()
OUTPUT
Enter First Filename: file1.txt
Enter Second Filename: file2.txt
Content of first file copied to second file
Content of Second file :
Hello
Good Morning
Welcome to Data Science Lab
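The same copy can also be written with context managers, which close both files automatically even if an error occurs. A minimal alternative sketch (not part of the original script):

file1 = input("Enter First Filename: ")
file2 = input("Enter Second Filename: ")

# open both files; the with-block closes them automatically
with open(file1, 'r') as src, open(file2, 'w') as dst:
    dst.write(src.read())
print("Content of first file copied to second file")

with open(file2, 'r') as dst:
    print("Content of Second file :")
    print(dst.read())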
EXPERIMENT-4

AIM: Write a program to implement the Naïve Bayesian classifier for a sample training data set stored as a .CSV file. Compute the accuracy of the classifier, considering a few test data sets.
Source Code:
# import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
# load data from CSV
data = pd.read_csv('tennisdata.csv')
print ("The first 5 values of data is:\n",data.head())

# obtain Train data and Train output


X = data.iloc[:,:-1]
print("\n The First 5 values of train data is\n",X.head())

y = data.iloc[:,-1]
print ("\n the first 5 values of Train output is\n",y.head())

# Convert them to numbers


le_outlook = LabelEncoder()
X.Outlook = le_outlook.fit_transform(X.Outlook)
le_Temperature = LabelEncoder()
X.Temperature = le_Temperature.fit_transform(X.Temperature)
le_Humidity = LabelEncoder()
X.Humidity = le_Humidity.fit_transform(X.Humidity)
le_Windy = LabelEncoder()
X.Windy = le_Windy.fit_transform(X.Windy)
print("\nNow the Train data is:\n",X.head())

le_PlayTennis = LabelEncoder()
y = le_PlayTennis.fit_transform(y)
print("\nNow the Train output is\n",y)

from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.20)
classifier = GaussianNB()
classifier.fit(X_train,y_train)
from sklearn.metrics import accuracy_score
print("Accuracy is:",accuracy_score(classifier.predict(X_test),y_test))

SAVE AS "tennisdata.csv" IN THE PATH C:\Users\KGR\AppData\Local\Programs\Python\Python312

     Outlook   Temperature  Humidity  Windy  Play Golf
0    Rainy     Hot          High      FALSE  No
1    Rainy     Hot          High      TRUE   No
2    Overcast  Hot          High      FALSE  Yes
3    Sunny     Mild         High      FALSE  Yes
4    Sunny     Cool         Normal    FALSE  Yes
5    Sunny     Cool         Normal    TRUE   No
6    Overcast  Cool         Normal    TRUE   Yes
7    Rainy     Mild         High      FALSE  No
8    Rainy     Cool         Normal    FALSE  Yes
9    Sunny     Mild         Normal    FALSE  Yes
10   Rainy     Mild         Normal    TRUE   Yes
11   Overcast  Mild         High      TRUE   Yes
12   Overcast  Hot          Normal    FALSE  Yes
13   Sunny     Mild         High      TRUE   No
OUTPUT:
The first 5 values of data is:
Index Outlook Temperature Humidity Windy Play Golf
0 0 Rainy Hot High False No
1 1 Rainy Hot High True No
2 2 Overcast Hot High False Yes
3 3 Sunny Mild High False Yes
4 4 Sunny Cool Normal False Yes

The First 5 values of train data is


Index Outlook Temperature Humidity Windy
0 0 Rainy Hot High False
1 1 Rainy Hot High True
2 2 Overcast Hot High False
3 3 Sunny Mild High False
4 4 Sunny Cool Normal False

the first 5 values of Train output is


0 No
1 No
2 Yes
3 Yes
4 Yes
Name: Play Golf, dtype: object

Now the Train data is:


Index Outlook Temperature Humidity Windy
0 0 1 1 0 0
1 1 1 1 0 1
2 2 0 1 0 0
3 3 2 2 0 0
4 4 2 0 1 0

Now the Train output is


[0 0 1 1 1 0 1 0 1 1 1 1 1 0]
Accuracy is: 0.6666666666666666
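A single 80/20 split of 14 rows leaves only a 3-row test set, so the accuracy above is very noisy. As a sketch of testing on "a few test data sets", cross-validation averages accuracy over several splits; it reuses the encoded X and y from the listing above:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the encoded features and labels
scores = cross_val_score(GaussianNB(), X, y, cv=5)
print("Per-fold accuracy:", scores)
print("Mean accuracy:", scores.mean())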
EXPERIMENT-5

AIM: Write a program to demonstrate regression analysis with residual plots on a given data set.

# import packages and libraries


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols

# reading the csv file


data = pd.read_csv('headbrain3.csv')

# fit simple linear regression model


linear_model = ols('Brain_weight ~ Head_size', data=data).fit()

# display model summary


print(linear_model.summary())

# modify figure size


fig = plt.figure(figsize=(14, 8))

# creating regression plots


fig = sm.graphics.plot_regress_exog(linear_model, 'Head_size', fig=fig)
plt.show()
OUTPUT
OLS Regression Results
====================================================================
Dep. Variable: Brain_weight R-squared: 0.516
Model: OLS Adj. R-squared: 0.511
Method: Least Squares F-statistic: 105.4
Date: Sat, 30 Mar 2024 Prob (F-statistic): 2.85e-17
Time: 10:02:05 Log-Likelihood: -580.70
No. Observations: 101 AIC: 1165.
Df Residuals: 99 BIC: 1171.
Df Model: 1
Covariance Type: nonrobust
====================================================================
coef std err t P>|t| [0.025 0.975]
---------------------------------------------------------------------------------------------------------------------
Intercept 437.4384 87.990 4.971 0.000 262.847 612.030
Head_size 0.2360 0.023 10.268 0.000 0.190 0.282
====================================================================
Omnibus: 1.314 Durbin-Watson: 2.023
Prob(Omnibus): 0.518 Jarque-Bera (JB): 1.019
Skew: 0.244 Prob(JB): 0.601
Kurtosis: 3.061 Cond. No. 4.41e+04
====================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 4.41e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
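plot_regress_exog draws four diagnostic panels at once. A plain residuals-versus-fitted plot is often easier to read; a minimal sketch reusing linear_model from the listing above (same assumed headbrain3.csv columns):

import matplotlib.pyplot as plt

fitted = linear_model.fittedvalues
residuals = linear_model.resid

plt.scatter(fitted, residuals)
plt.axhline(0, color='red', linestyle='--')  # zero-residual reference line
plt.xlabel('Fitted Brain_weight')
plt.ylabel('Residual')
plt.title('Residuals vs Fitted')
plt.show()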
EXPERIMENT-6

AIM: Write a program to demonstrate the working of the decision tree-based ID3
algorithm. Use an appropriate data set for building the decision tree and apply this
knowledge to classify a new sample.
import pandas as pd
import numpy as np
import math

data = pd.read_csv("dataset.csv")
features = [feat for feat in data.columns if feat != "answer"]

class Node:
    def __init__(self):
        self.children = []
        self.value = ""
        self.isLeaf = False
        self.pred = ""

def entropy(examples):
    pos = sum(examples["answer"] == "yes")
    neg = sum(examples["answer"] == "no")
    total = len(examples)
    if pos == 0 or neg == 0:
        return 0.0
    else:
        p = pos / total
        n = neg / total
        return -(p * math.log2(p) + n * math.log2(n))

def info_gain(examples, attr):
    uniq = np.unique(examples[attr])
    gain = entropy(examples)
    for u in uniq:
        subdata = examples[examples[attr] == u]
        sub_e = entropy(subdata)
        gain -= (len(subdata) / len(examples)) * sub_e
    return gain

def ID3(examples, attrs):
    root = Node()
    max_gain = 0
    max_feat = ""
    for feature in attrs:
        gain = info_gain(examples, feature)
        if gain > max_gain:
            max_gain = gain
            max_feat = feature
    root.value = max_feat
    uniq = np.unique(examples[max_feat])
    for u in uniq:
        subdata = examples[examples[max_feat] == u]
        if entropy(subdata) == 0.0:
            newNode = Node()
            newNode.isLeaf = True
            newNode.value = u
            newNode.pred = np.unique(subdata["answer"])
            root.children.append(newNode)
        else:
            dummyNode = Node()
            dummyNode.value = u
            new_attrs = attrs.copy()
            new_attrs.remove(max_feat)
            child = ID3(subdata, new_attrs)
            dummyNode.children.append(child)
            root.children.append(dummyNode)
    return root

def printTree(root: Node, depth=0):
    for i in range(depth):
        print("\t", end="")
    print(root.value, end="")
    if root.isLeaf:
        print(" -> ", root.pred)
        print()
    else:
        print()
        for child in root.children:
            printTree(child, depth + 1)

def classify(root: Node, new):
    for child in root.children:
        if child.value == new[root.value]:
            if child.isLeaf:
                print("Predicted Label for new example", new, "is:", child.pred)
                return
            else:
                classify(child.children[0], new)

root = ID3(data, features)


print("Decision Tree is:")
printTree(root)
print("------------------")
new = {"outlook": "sunny", "temperature": "hot", "humidity": "normal", "wind": "strong"}
classify(root, new)
OUTPUT
Decision Tree is:
outlook
    overcast -> ['yes']

    rain
        wind
            strong -> ['no']

            weak -> ['yes']

    sunny
        humidity
            high -> ['no']

            normal -> ['yes']

------------------
Predicted Label for new example {'outlook': 'sunny', 'temperature': 'hot', 'humidity': 'normal', 'wind': 'strong'} is: ['yes']
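To see what ID3 computes at the root, here is the entropy arithmetic by hand, assuming dataset.csv holds the classic 14-row play-tennis data (9 yes / 5 no); the helper H is an illustrative addition:

import math

# Root entropy for 9 "yes" and 5 "no" examples
p, n = 9 / 14, 5 / 14
root_entropy = -(p * math.log2(p) + n * math.log2(n))
print(round(root_entropy, 3))  # approx. 0.940

def H(pos, neg):
    # Entropy of a subset with pos positive and neg negative examples
    total = pos + neg
    if pos == 0 or neg == 0:
        return 0.0
    a, b = pos / total, neg / total
    return -(a * math.log2(a) + b * math.log2(b))

# Gain of "outlook": sunny (2 yes / 3 no), overcast (4/0), rain (3/2)
gain = root_entropy - (5/14) * H(2, 3) - (4/14) * H(4, 0) - (3/14) * H(3, 2)
print(round(gain, 3))  # approx. 0.247, the largest gain, so outlook is the root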
EXPERIMENT-7

AIM: Write a program to implement the k-Nearest Neighbor algorithm to classify the iris data set. Print both correct and wrong predictions using Java/Python ML library classes.
Source Code:
#Python Program to Implement and Demonstrate KNN Algorithm
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'Class']

# Read dataset to pandas dataframe


dataset = pd.read_csv("9-dataset.csv", names=names)
X = dataset.iloc[:, :-1]
y = dataset.iloc[:, -1]
print(X.head())
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.10)

classifier = KNeighborsClassifier(n_neighbors=5).fit(Xtrain, ytrain)

ypred = classifier.predict(Xtest)

i = 0
print("\n-------------------------------------------------------------------------")
print('%-25s %-25s %-25s' % ('Original Label', 'Predicted Label', 'Correct/Wrong'))
print("-------------------------------------------------------------------------")
for label in ytest:
    print('%-25s %-25s' % (label, ypred[i]), end="")
    if label == ypred[i]:
        print(' %-25s' % 'Correct')
    else:
        print(' %-25s' % 'Wrong')
    i = i + 1
print("-------------------------------------------------------------------------")
print("\nConfusion Matrix:\n", metrics.confusion_matrix(ytest, ypred))
print("-------------------------------------------------------------------------")
print("\nClassification Report:\n", metrics.classification_report(ytest, ypred))
print("-------------------------------------------------------------------------")
print('Accuracy of the classifier is %0.2f' % metrics.accuracy_score(ytest, ypred))
print("-------------------------------------------------------------------------")
OUTPUT
sepal-length sepal-width petal-length petal-width
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
-------------------------------------------------------------------------
Original Label Predicted Label Correct/Wrong
-------------------------------------------------------------------------
Iris-versicolor Iris-versicolor Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-versicolor Iris-versicolor Correct
Iris-setosa Iris-setosa Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-virginica Correct
Iris-virginica Iris-versicolor Wrong
Iris-virginica Iris-virginica Correct
Iris-setosa Iris-setosa Correct
Iris-setosa Iris-setosa Correct
-------------------------------------------------------------------------
Confusion Matrix:
[[4 0 0]
[0 3 0]
[0 1 7]]
-------------------------------------------------------------------------
Classification Report:
precision recall f1-score support

Iris-setosa 1.00 1.00 1.00 4


Iris-versicolor 0.75 1.00 0.86 3
Iris-virginica 1.00 0.88 0.93 8

accuracy 0.93 15
macro avg 0.92 0.96 0.93 15
weighted avg 0.95 0.93 0.94 15
-------------------------------------------------------------------------
Accuracy of the classifier is 0.93
-------------------------------------------------------------------------
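The choice n_neighbors=5 above is arbitrary. A quick sweep over k, reusing the train/test split from the listing (results vary with the random split), shows how the accuracy moves:

for k in [1, 3, 5, 7, 9]:
    knn = KNeighborsClassifier(n_neighbors=k).fit(Xtrain, ytrain)
    print(k, metrics.accuracy_score(ytest, knn.predict(Xtest)))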
EXPERIMENT-8
AIM: Write a program to implement the k-Means clustering algorithm to cluster a set of data stored in a .CSV file. Compare the results of various “k” values for the quality of clustering.
Source Code:
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
import sklearn.metrics as metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
names = ['Sepal_Length','Sepal_Width','Petal_Length','Petal_Width', 'Class']
dataset = pd.read_csv("9-dataset.csv", names=names)
X = dataset.iloc[:, :-1]
label = {'Iris-setosa': 0,'Iris-versicolor': 1, 'Iris-virginica': 2}
y = [label[c] for c in dataset.iloc[:, -1]]
plt.figure(figsize=(14,7))
colormap=np.array(['red','lime','black'])

# REAL PLOT
plt.subplot(1,3,1)
plt.title('Real')
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[y])

# K-PLOT
model=KMeans(n_clusters=3, random_state=0).fit(X)
plt.subplot(1,3,2)
plt.title('KMeans')
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[model.labels_])
print('The accuracy score of K-Mean: ',metrics.accuracy_score(y, model.labels_))
print('The Confusion matrix of K-Mean:\n',metrics.confusion_matrix(y, model.labels_))
# GMM PLOT
gmm=GaussianMixture(n_components=3, random_state=0).fit(X)
y_cluster_gmm=gmm.predict(X)
plt.subplot(1,3,3)
plt.title('GMM Classification')
plt.scatter(X.Petal_Length,X.Petal_Width,c=colormap[y_cluster_gmm])
print('The accuracy score of EM: ',metrics.accuracy_score(y, y_cluster_gmm))
print('The Confusion matrix of EM:\n ',metrics.confusion_matrix(y, y_cluster_gmm))
plt.show()
OUTPUT:
The accuracy score of K-Mean: 0.24
The Confusion matrix of K-Mean:
[[ 0 50 0]
[47 0 3]
[14 0 36]]
The accuracy score of EM: 0.3333333333333333
The Confusion matrix of EM:
[[ 0 50 0]
[45 0 5]
[ 0 0 50]]
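Note that accuracy_score treats cluster ids as class labels, so a good clustering can still score low when the ids come out permuted, as the K-Means result above shows. To actually compare several “k” values, as the AIM asks, a minimal elbow-method sketch (an addition; inertia_ is KMeans' within-cluster sum of squares):

inertias = []
for k in range(1, 8):
    km = KMeans(n_clusters=k, random_state=0).fit(X)
    inertias.append(km.inertia_)
plt.plot(range(1, 8), inertias, marker='o')
plt.xlabel('k (number of clusters)')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.title('Elbow method for choosing k')
plt.show()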
EXPERIMENT-9

AIM: Write a program to build an Artificial Neural Network and test it using appropriate data sets.

SOURCE CODE:
import numpy as np

class NeuralNet(object):
    def __init__(self):
        # Seed the random number generator so runs are reproducible
        np.random.seed(1)
        # Assign random weights to a 3 x 1 matrix, in the range -1 to 1
        self.synaptic_weights = 2 * np.random.random((3, 1)) - 1

    # The Sigmoid function
    def __sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    # The derivative of the Sigmoid function.
    # This is the gradient of the Sigmoid curve.
    def __sigmoid_derivative(self, x):
        return x * (1 - x)

    # Train the neural network and adjust the weights each time.
    def train(self, inputs, outputs, training_iterations):
        for iteration in range(training_iterations):
            # Pass the training set through the network.
            output = self.learn(inputs)
            # Calculate the error
            error = outputs - output
            # Adjust the weights by a factor
            factor = np.dot(inputs.T, error * self.__sigmoid_derivative(output))
            self.synaptic_weights += factor

    # The neural network thinks.
    def learn(self, inputs):
        return self.__sigmoid(np.dot(inputs, self.synaptic_weights))

if __name__ == "__main__":
    # Initialize
    neural_network = NeuralNet()
    # The training set: three examples, each with three inputs and one output.
    inputs = np.array([[0, 1, 1], [1, 0, 0], [1, 0, 1]])
    outputs = np.array([[1, 0, 1]]).T
    # Train the neural network
    neural_network.train(inputs, outputs, 10000)
    # Test the neural network with a test example.
    print(neural_network.learn(np.array([1, 0, 1])))

OUTPUT:
[0.9897704]
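The single-layer net above effectively learns the rule "output = first input". A quick way to probe that (a sketch, not in the original listing) is to query input patterns the network never saw during training:

# Values near 1 mean the net predicts "yes" for patterns whose first input is 1
for test in ([0, 0, 0], [0, 0, 1], [1, 1, 0], [1, 1, 1]):
    print(test, neural_network.learn(np.array(test)))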
