Data Analytics Lab Manual
Data Analytics Lab Manual
Course: - B. Tech
Year: 3rd /6th Sem
SUBMITTED TO SUBMITTED BY
Name: Ms.Kavya Goswami Name: Amitesh Pandey
Ms.Pooja Tomar
Designation :-Assistant Professor
Roll no: 2201920130025
INDEX
S. No. | Program Name | Execution Date | Submission Date | Signature | Remarks
1. To get the input from the user and perform numerical operations (MAX, MIN, AVG, SUM, SQRT, ROUND) using R/Python.
2. To perform data import/export (.CSV, .XLS, .TXT) operations using data frames in R/Python.
3. To get the input matrix from the user and perform matrix addition, subtraction, multiplication, inverse, transpose and division operations using vector concept in Python.
4. To perform statistical operations (Mean, Median, Mode and Standard deviation) using Python.
5. To perform data pre-processing operations: i) Handling missing data ii) Min-Max normalization.
Program - 1
Objective: To get the input from the user and perform numerical operations
(MAX, MIN, AVG, SUM, SQRT, ROUND) using Python.
Code:
import pandas as pd
import numpy as np
import seaborn as sns
# Load the iris sample data set through seaborn.
df = sns.load_dataset('iris')

# Names of all numeric columns, collected into a plain Python list.
numeric = list(df.select_dtypes(include=[np.number]).columns)

# Pick one column and expose its values as a NumPy array.
col = "sepal_length"
data = df[col].to_numpy()

# Show the first rows of the frame, then the numeric column names.
print(df.head())
print(numeric)
Output:
Program -2
Objective: To perform data import/export (.CSV, .XLS, .TXT) operations using data frames
in R/Python
Implementation:
Importing Data
1. Export to .CSV
# Write the df_csv data frame to output.csv; index=False keeps the row index out of the file.
df_csv.to_csv('output.csv', index=False)
2. Export to .XLSX
# Write the df_excel data frame to output.xlsx; index=False keeps the row index out of the sheet.
df_excel.to_excel('output.xlsx', index=False)
Program -3
Objective: To get the input matrix from the user and perform matrix addition, subtraction,
multiplication, inverse, transpose and division operations using vector concept in Python.
Code:
import numpy as np
# Read the second operand.
# NOTE(review): A and get_matrix are not defined in this listing; they are
# presumably created earlier in the program - confirm.
print("\nMatrix B:")
B = get_matrix("Matrix B")

print("\n--- Matrix Operations ---")

# Element-wise addition and subtraction both require identical shapes,
# so the shape comparison is evaluated once and reused.
shapes_match = A.shape == B.shape

if shapes_match:
    print("\nAddition (A + B):")
    print(A + B)
else:
    print("\nAddition not possible (shape mismatch)")

if shapes_match:
    print("\nSubtraction (A - B):")
    print(A - B)
else:
    print("\nSubtraction not possible (shape mismatch)")

# Matrix product: the column count of A must equal the row count of B.
if A.shape[1] != B.shape[0]:
    print("\nMultiplication not possible (columns of A != rows of B)")
else:
    print("\nMultiplication (A * B):")
    print(np.dot(A, B))

# Transposes are always defined, so no guard is needed.
for label, matrix in (("A", A), ("B", B)):
    print(f"\nTranspose of {label}:")
    print(matrix.T)
# Inverse
# Each matrix is inverted inside its own try block. Previously a single try
# wrapped both, so a singular A raised before B was ever attempted and the
# message did not say which matrix had failed.
for label, matrix in (("A", A), ("B", B)):
    if matrix.shape[0] != matrix.shape[1]:
        print(f"\nInverse of {label} not possible ({label} is not square)")
        continue
    try:
        inverse = np.linalg.inv(matrix)
    except np.linalg.LinAlgError:
        print(f"\nInverse of {label} not possible ({label} is singular)")
    else:
        # The header is printed only once the inverse is known to exist.
        print(f"\nInverse of {label}:")
        print(inverse)
# Division (A * inv(B))
# "Division" is defined here as right-multiplying A by the inverse of B, which
# needs a square B whose size matches the column count of A.
divisible = B.shape[0] == B.shape[1] and A.shape[1] == B.shape[0]

if not divisible:
    print("\nDivision not possible (shape mismatch or B not square)")
else:
    try:
        print("\nDivision (A * inv(B)):")
        B_inv = np.linalg.inv(B)
        print(np.dot(A, B_inv))
    except np.linalg.LinAlgError:
        print("\nDivision not possible (B is singular)")
Output:
Program - 4
Code:
import csv
import statistics
def read_data_from_csv(file_path):
    """Read the ``value`` column of a CSV file into a list of floats.

    Rows whose ``value`` cell is missing or not numeric are reported on
    standard output and skipped.

    Args:
        file_path: Path of a CSV file whose header row contains ``value``.

    Returns:
        A list of floats in file order. The list is empty when the file does
        not exist, has no ``value`` column, or holds no valid numbers.
    """
    data = []
    try:
        # newline='' lets the csv module do its own line-ending handling.
        with open(file_path, mode='r', newline='') as file:
            csv_reader = csv.DictReader(file)

            # Report a missing column once instead of failing with KeyError
            # on the first row. fieldnames is None for an empty file.
            if 'value' not in (csv_reader.fieldnames or []):
                print(f"Column 'value' not found in: {file_path}")
                return data

            for row in csv_reader:
                raw_value = row['value']
                try:
                    data.append(float(raw_value))  # Ensure numeric data
                except (TypeError, ValueError):
                    # TypeError covers short rows: DictReader fills the
                    # missing cells with None, and float(None) is a TypeError.
                    print(f"Skipping invalid data: {raw_value}")
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    return data
def compute_statistics(data):
    """Print the mean, median, mode and sample standard deviation of *data*.

    Args:
        data: A sequence of numbers. An empty sequence prints a notice and
            nothing else.

    Returns:
        None. All results are written to standard output.
    """
    if not data:
        print("No valid data to compute statistics.")
        return

    try:
        mean = statistics.mean(data)
        median = statistics.median(data)
        mode = statistics.mode(data)
        # The sample standard deviation needs at least two observations.
        # Skipping it for a single value keeps the other three results
        # instead of discarding the whole summary on a StatisticsError.
        std_dev = statistics.stdev(data) if len(data) > 1 else None
    except statistics.StatisticsError as e:
        print(f"Statistics Error: {e}")
        return

    print("\nStatistics Summary:")
    print(f"Mean: {mean}")
    print(f"Median: {median}")
    print(f"Mode: {mode}")
    if std_dev is None:
        print("Standard Deviation: N/A (needs at least two values)")
    else:
        print(f"Standard Deviation: {std_dev:.2f}")
def main():
    """Load the numbers from data.csv and print their summary statistics."""
    compute_statistics(read_data_from_csv('data.csv'))


# Run the program only when this file is executed as a script.
if __name__ == "__main__":
    main()
Output:
Program - 5
Objective: To perform data pre-processing operations i) Handling Missing data ii) Min-Max
normalization.
Code:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
# Entry point of the pre-processing program: reads data_preprocessing.csv into
# a data frame and prints it.
# NOTE(review): this listing is truncated. The try block has no except/finally
# clause, and the missing-data handling and Min-Max normalization named in the
# objective are not shown, so the code cannot run as printed.
def main():
file_path = 'data_preprocessing.csv'
try:
df = pd.read_csv(file_path)
print("Original Data:\n", df)
# Run main() only when the file is executed as a script.
if __name__ == "__main__":
main()
Output:
Program - 6
Objective: To perform dimensionality reduction operation using PCA for Houses Data Set.
Code:
import pandas as pd
import matplotlib.pyplot as plt
# Display the current Matplotlib figure.
# NOTE(review): the plotting calls that build it are not part of this listing.
plt.show()
# Step 6: Print explained variance
# NOTE(review): pca is presumably the fitted PCA object from an earlier step
# that is not shown here - confirm.
print("Explained Variance Ratio:", pca.explained_variance_ratio_)
Output:
Program - 7
Code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the salary data set and take a first look at it.
sd = pd.read_csv("salary_data.csv")
# Fixed: the listing called wd.head(), but no `wd` exists; the frame is `sd`.
sd.head()
sd.columns

# Predictor and response columns.
x = sd['YearsExperience']
y = sd['Salary']

# Intercept and coefficient of the fitted model.
# NOTE(review): lr_model is not created in this listing; presumably a fitted
# linear-regression estimator from a step that is not shown - confirm.
theta_0 = lr_model.intercept_
theta_1 = lr_model.coef_
theta_0, theta_1
Program - 8
Objective: To perform K-Means clustering operation and visualize for iris data set.
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns
# Matplotlib 3.6 renamed the bundled 'seaborn' style to 'seaborn-v0_8' and
# later releases dropped the old name, so use the new name when it exists and
# fall back to the old one on older installations.
style_name = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(style_name)
fig, ax = plt.subplots(figsize=(10, 7))

# One scatter layer per predicted cluster: (cluster id, colour, legend label).
# NOTE(review): K-Means cluster ids are arbitrary, so this id-to-species
# labelling is an assumption - confirm it against the fitted model.
cluster_styles = (
    (0, 'lime', 'Iris-versicolor'),
    (1, 'teal', 'Iris-setosa'),
    (2, 'magenta', 'Iris-virginica'),
)
for cluster_id, colour, species in cluster_styles:
    members = clusters[clusters['Cluster_Prediction'] == cluster_id]
    plt.scatter(x=members['SepalLengthCm'], y=members['SepalWidthCm'],
                s=70, edgecolor=colour, linewidth=0.3, c=colour, label=species)

# Cluster centres, drawn larger and outlined so they stand out.
plt.scatter(x=kms.cluster_centers_[:, 0], y=kms.cluster_centers_[:, 1], s=170,
            c='red', label='Centroids', edgecolor='black', linewidth=0.3)

plt.legend(loc='upper right')
plt.xlim(4, 8)
plt.ylim(1.8, 4.5)
ax.set_ylabel('Sepal Width (in cm)')
ax.set_xlabel('Sepal Length (in cm)')
plt.title('Clusters', fontsize=20)
plt.show()
Output:
Program - 9
Objective: Write a Python script to diagnose any disease using KNN classification and plot
the results.
Code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
# Load the heart data set and inspect its size and first rows.
data = pd.read_csv('heart.csv')
data.shape
data.head()

# Count of records for each value of the target column.
sns.countplot(data=data, x="target", palette="bwr")
plt.show()

# Count of records for each value of the sex column.
sns.countplot(data=data, x='sex', palette="mako_r")
plt.xlabel("Sex (0 = female, 1= male)")
plt.show()

# Age against thalach (labelled "Maximum Heart Rate" below), split by target.
with_disease = data[data.target == 1]
without_disease = data[data.target == 0]
plt.scatter(x=with_disease.age, y=with_disease.thalach, c="red")
plt.scatter(x=without_disease.age, y=without_disease.thalach, c='black')
plt.legend(["Disease", "Not Disease"])
plt.xlabel("Age")
plt.ylabel("Maximum Heart Rate")
plt.show()
# Features are every column except the last; the target is the last column.
# The target was selected with a hard-coded index 13, which agrees with the
# feature slice only when the frame has exactly 14 columns.
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

# Hold out 25% of the rows for testing.
# NOTE(review): this call was cut off in the listing after "random_stat", so
# the original seed is unknown. 0 is a placeholder; set the seed actually used
# to reproduce the recorded accuracies.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

# Fit the scaler on the training rows only and reuse it for the test rows, so
# no information from the test set reaches the scaling parameters.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
# Train and score a K-nearest-neighbours classifier (Minkowski metric with
# p=2) for k = 5, 6, 7 and 8. The listing repeated the same four statements
# once per k with raw output lines between them; a loop keeps one copy.
#
# Output recorded in the original run:
#   k=5 -> Accuracy: 0.88
#   k=6 -> Accuracy: 0.88
#   k=7 -> Accuracy: 0.86
#   k=8 -> Accuracy: 0.89
for k in (5, 6, 7, 8):
    classifier = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)
    classifier = classifier.fit(X_train, y_train)
    # prediction
    y_pred = classifier.predict(X_test)
    # check accuracy
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print('Accuracy: {:.2f}'.format(accuracy))
Conclusion: Diagnosed heart disease using KNN classification and plotted the results.
Program - 10
Code:
import pandas as pd
import numpy as np
from apyori import apriori
# Read the basket file; header=None treats the first line as data, not names.
store_data = pd.read_csv("Market_Basket_Optimisation.csv", header=None)

# Number of rows read from the file.
num_record = store_data.shape[0]
print(num_record)
7501
# Build one list of item names per row of the data frame.
# Two fixes over the listing:
#   * empty cells (NaN) are dropped instead of becoming a literal 'nan' item,
#     which would otherwise be treated as a purchasable product;
#   * every column present is used instead of a hard-coded 20, which fails on
#     narrower files and ignores extra columns on wider ones.
# NOTE(review): removing the 'nan' pseudo-item can change the number of rules
# compared with earlier runs of this program.
records = [
    [str(item) for item in row if pd.notna(item)]
    for row in store_data.values
]

association_rules = apriori(records, min_support=0.0053,
                            min_confidence=0.20, min_lift=3, min_length=2)
association_results = list(association_rules)
print(len(association_results))
32
# Show the first result in full.
print(association_results[0])

# Flatten each result into a five-field row. Fields are read by position:
# index 0 is the item collection, index 1 feeds the Support_count column, and
# positions 2 and 3 of the first entry under index 2 feed Confidence and Lift.
# Numeric fields are rendered as text and cut to 7 characters.
results = []
for record in association_results:
    members = list(record[0])
    first_stat = record[2][0]
    results.append((
        str(members[0]),
        str(members[1]),
        str(record[1])[:7],
        str(first_stat[2])[:7],
        str(first_stat[3])[:7],
    ))

Label = ['Item1', 'Item2', 'Support_count', 'Confidence', 'Lift']
store_suggestions = pd.DataFrame.from_records(results, columns=Label)
print(store_suggestions)
Output:
Program - 11
Code:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
# Load the student performance data set.
# Fixed: pandas is imported under the alias `pd`, so the bare name `pandas`
# used in the listing is undefined and raises NameError.
df = pd.read_csv("Student_Performance.csv")
df.columns
Output: