0% found this document useful (0 votes)
12 views30 pages

20BIT037 - Data Analytics

Data Analytics Synopsis and summary for engineering

Uploaded by

Anish Sil
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
12 views30 pages

20BIT037 - Data Analytics

Data Analytics Synopsis and summary for engineering

Uploaded by

Anish Sil
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 30

Anish Sil

Name: Anish Sil


Roll no.: 20BIT037
Div: 1 (H2)
Subject: Big Data Analytics Lab

Data Analysis with Python

**Analyzing Numerical Data with NumPy**

import numpy as np

b = np.empty(2, dtype = int)

print("Matrix b: \n",b)

a = np.empty([2,2], dtype = int)

print("\nMatrix a: \n",a)

c = np.empty([3,3])

print("\nMatrix c: \n",c)

# 1) why matrices are not having entries as 0?

d = np.zeros(2, dtype = int)

print("\nMatrix d: \n",d)

e = np.zeros([2,2], dtype = int)

print("\nMatrix e: \n",e)

f = np.zeros([3,3])

print("\nMatrix f: \n",f)
Page 1 of 30
Anish Sil

print("\n Addition Example: ")

# Addition

# defining both the matrices

g = np.array([5, 72, 13, 100])

h = np.array([2, 5, 10, 30])

# performing addition using arithmetic operator

add_ans = g+h

print(add_ans)

# performing addition using numpy function

add_ans1 = np.add(g,h)

print(add_ans1)

# the same functions and operations can be used for

# multiple matrices

i = np.array([1, 2, 3, 4])

add_ans2 = g+h+i

print(add_ans2)

add_ans3 = np.add(g,h,i)

print(add_ans3)

print("\n Ans: 2")

# 2) add two matrices.

Page 2 of 30
Anish Sil
a1 = np.array([5, 10, 15, 20])

b1 = np.array([12, 24, 36, 48])

c1 = np.array([3, 6, 9, 12])

add2 = a1+b1

print(add2)

add2_1 = np.add(a1, b1)

print(add2_1)

print("\n Ans: 3")

# 3) add more than two array.

add3 = a1+b1+c1

print(add3)

add3_2 = np.add(add2_1, c1)

print(add3_2)

print("\n Subtraction Example: ")

# subtraction

# performing subtraction using arithmetic operator

sub_ans = g-h

print(sub_ans)

# performing subtraction using numpy function

sub_ans1 = np.subtract(g,h)
Page 3 of 30
Anish Sil
print(sub_ans1)

print("\n Ans: 4")

# 4) subtract two matrices.

sub2 = a1-b1

print(sub2)

sub2_1 = np.subtract(a1, b1)

print(sub2_1)

print("\n Ans: 5")

# 5) subtract more than two arrays.

sub3 = a1-b1-c1

print(sub3)

sub3_2 = np.subtract(sub2_1, c1)

print(sub3_2)

print("\n Multiplication Example: ")

# multiplication

# performing multiplication using arithmetic operator

mul_ans = g*h

print(mul_ans)

Page 4 of 30
Anish Sil
# performing multiplication using numpy function

mul_ans1 = np.multiply(g,h)

print(mul_ans1)

print("\n Ans: 6")

# 6) multiply two matrices.

mul2 = a1*b1

print(mul2)

mul2_1 = np.multiply(a1, b1)

print(mul2_1)

print("\n Ans: 7")

# 7) multiply more than two arrays.

mul3 = a1*b1*c1

print(mul3)

mul3_2 = np.multiply(mul2_1, c1)

print(mul3_2)

print("\n Division Example: ")

# division

# performing division using arithmetic operator

div_ans = g/h
Page 5 of 30
Anish Sil
print(div_ans)

# performing division using numpy function

div_ans1 = np.divide(g,h)

print(div_ans1)

print("\n Ans: 8")

# 8) divide two matrices.

div2 = a1/b1

print(div2)

div2_1 = np.divide(a1, b1)

print(div2_1)

print("\n Ans: 9")

# 9) divide more than two arrays.

div3 = a1/b1/c1

print(div3)

div3_2 = np.divide(div2_1, c1)

print(div3_2)

-------------------------------------------------------------------------------------

Matrix b:
[ 4 1072693248]

Matrix a:
[[1 0]
[0 1]]
Page 6 of 30
Anish Sil

Matrix c:
[[-3. 2. -6.]
[ 5. 7. -5.]
[ 1. 4. -2.]]

Matrix d:
[0 0]

Matrix e:
[[0 0]
[0 0]]

Matrix f:
[[0. 0. 0.]
[0. 0. 0.]
[0. 0. 0.]]

Addition Example:
[ 7 77 23 130]
[ 7 77 23 130]
[ 8 79 26 134]
[ 7 77 23 130]

Ans: 2
[17 34 51 68]
[17 34 51 68]

Ans: 3
[20 40 60 80]
[20 40 60 80]

Subtraction Example:
[ 3 67 3 70]
[ 3 67 3 70]

Ans: 4
[ -7 -14 -21 -28]
[ -7 -14 -21 -28]

Ans: 5
[-10 -20 -30 -40]
[-10 -20 -30 -40]

Multiplication Example:
[ 10 360 130 3000]
[ 10 360 130 3000]

Ans: 6
[ 60 240 540 960]
Page 7 of 30
Anish Sil
[ 60 240 540 960]

Ans: 7
[ 180 1440 4860 11520]
[ 180 1440 4860 11520]

Division Example:
[ 2.5 14.4 1.3 3.33333333]
[ 2.5 14.4 1.3 3.33333333]

Ans: 8
[0.41666667 0.41666667 0.41666667 0.41666667]
[0.41666667 0.41666667 0.41666667 0.41666667]

Ans: 9
[0.13888889 0.06944444 0.0462963 0.03472222]
[0.13888889 0.06944444 0.0462963 0.03472222]

-------------------------------------------------------------------------------------
**Python NumPy Array Indexing**

import numpy as np

# Psuedo Inverse

A = np.array([[1,4,3],[2,5,6]])

#A = np.array([[1,4],[3,2],[5,6]])

print(A)

print("\n")

B = np.transpose(A)

C = np.matmul(B,A)

E = np.linalg.det(C)

if(E==0):

print("Psuedo Inverse is not possible by (ATA)-1AT!")

print("\n")

else:

F = np.linalg.inv(C)

Page 8 of 30
Anish Sil
G = np.matmul(C, B)

print(G)

print("\n")

print("Psuedo Inverse: ")

L = np.linalg.pinv(A)

print(L)

print("\n")

H = np.matmul(A,B)

I = np.linalg.det(H)

if(I==0):

print("Psuedo Inverse is not possible by AT(AAT)-1!")

else:

J = np.linalg.inv(H)

K = np.matmul(B, J)

print(K)

print("\n")

print("Psuedo Inverse: ")

M = np.linalg.pinv(A)

print(M)

print("\n")

# Condition Number

print("Ans 9:")

#(a)

#A = np.array([[1,0],[0,1]])

#(b)
Page 9 of 30
Anish Sil
#A = np.array([[3,1],[5,2]])

#(c)

A = np.array([[1,0],[0,1]])

B = np.array([[1],[2]])

Y = np.array([[3],[4]])

C = np.subtract(Y, B)

D = np.matmul(np.linalg.inv(A), C)

print(D)

print("\n")

print("Ans 8:")

N = np.linalg.cond(A)

print(N)

print("\n")

print("Ans 10:")

A = np.array([[-3,2,-6],[5,7,-5],[1,4,-2]])

B = np.array([[6],[6],[8]])

x = np.linalg.solve(A,B)

print(x)

-----------------------------------------------------------------------------------------

[[1 4 3]
[2 5 6]]

Psuedo Inverse is not possible by (ATA)-1AT!

[[-0.16666667 0.13333333]
[ 0.66666667 -0.33333333]
[-0.5 0.4 ]]

Page 10 of 30
Anish Sil

Psuedo Inverse:
[[-0.16666667 0.13333333]
[ 0.66666667 -0.33333333]
[-0.5 0.4 ]]

Ans 9:
[[2.]
[2.]]

Ans 8:
1.0

Ans 10:
[[-2.]
[ 3.]
[ 1.]]

-------------------------------------------------------------------------------------
**NumPy Array Slicing**

# Python program to demonstrate

import numpy as np

# Create a sequence of integers from

# 10 to 1 with a step of -2

a = np.arange(10, 1, -2)

print("\n A sequential array with a negative step: \n",a)

# Indexes are specified inside the np.array method.

newarr = a[np.array([3, 1, 2 ])]

print("\n Elements at these indices are:\n",newarr)

# Python program for basic slicing.

# Arrange elements from 0 to 19

Page 11 of 30
Anish Sil
a = np.arange(20)

print("\n Array is:\n ",a)

# a[start:stop:step]

print("\n a[-8:17:1] = ",a[-8:17:1])

# The : operator means all elements till the end.

print("\n a[10:] = ",a[10:])

-----------------------------------------------------------------------------------------

A sequential array with a negative step:


[10 8 6 4 2]

Elements at these indices are:


[4 8 6]

Array is:
[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19]

a[-8:17:1] = [12 13 14 15 16]

a[10:] = [10 11 12 13 14 15 16 17 18 19]


-----------------------------------------------------------------------------------------

**NumPy Array Broadcasting**

macros = np.array([

[0.8, 2.9, 3.9],

[52.4, 23.6, 36.5],

[55.2, 31.7, 23.9],

[14.4, 11, 4.9]

])

# Create a new array filled with zeros, of the same shape as macros.

result = np.zeros_like(macros)

cal_per_macro = np.array([3, 3, 8])

Page 12 of 30
Anish Sil
# Now multiply each row of macros by

# cal_per_macro. In Numpy, `*` is

# element-wise multiplication between two arrays.

for i in range(macros.shape[0]):

result[i, :] = macros[i, :] * cal_per_macro

result

v = np.array([12, 24, 36])

w = np.array([45, 55])

# To compute an outer product we first reshape v to a column vector of shape 3x1

# then broadcast it against w to yield an output of shape 3x2 which is the outer product
of v and w

print("\n",np.reshape(v, (3, 1)) * w)

X = np.array([[12, 22, 33], [45, 55, 66]])

# x has shape 2x3 and v has shape (3, ) so they broadcast to 2x3,

print("\n",X + v)

# Add a vector to each column of a matrix X has shape 2x3 and w has shape (2, ) If we
transpose X

# then it has shape 3x2 and can be broadcast against w to yield a result of shape 3x2.

# Transposing this yields the final result of shape 2x3 which is the matrix.

print("\n",(X.T + w).T)

# Another solution is to reshape w to be a column vector of shape 2X1 we can then


broadcast it

# directly against X to produce the same output.

print("\n",X + np.reshape(w, (2, 1)))

Page 13 of 30
Anish Sil
# Multiply a matrix by a constant: X has shape 2x3. NumPy treats scalars as
# arrays of shape ();

# these can be broadcast together to shape 2x3.

print("\n",X * 2)

-------------------------------------------------------------------------------------

[[ 540 660]
[1080 1320]
[1620 1980]]

[[ 24 46 69]
[ 57 79 102]]

[[ 57 67 78]
[100 110 121]]

[[ 57 67 78]
[100 110 121]]

[[ 24 44 66]
[ 90 110 132]]

-------------------------------------------------------------------------------------
**Analyzing Data Using Pandas**

import pandas as pd

import numpy as np

# Creating empty series

ser = pd.Series()

print(ser)

# simple array

data = np.array(['d', 'a', 't', 'a', '1'])

ser = pd.Series(data)

print("\n",ser)

# Calling DataFrame constructor

df = pd.DataFrame()

print("\n",df)

Page 14 of 30
Anish Sil
# list of strings

lst = ['Big', 'Data', 'Analytics', 'and',

'Computing', 'for', 'ICT']

# Calling DataFrame constructor on list

df = pd.DataFrame(lst)

df

-------------------------------------------------------------------------------------

Series([], dtype: object)

0 d
1 a
2 t
3 a
4 1
dtype: object

Empty DataFrame
Columns: []
Index: []

-------------------------------------------------------------------------------------
**Creating Dataframe from CSV**

import pandas as pd

# Reading the CSV file

df = pd.read_csv("iris_csv.csv")

# Printing top 5 rows

Page 15 of 30
Anish Sil
df.head()

-------------------------------------------------------------------------------------

-------------------------------------------------------------------------------------
# applying filter function

df.filter(["Species", "SepalLengthCm",

"SepalLengthCm"]).head()

-------------------------------------------------------------------------------------

-------------------------------------------------------------------------------------
**Groups**

# Define a dictionary containing employee data

data1 = {'Name': ['Jai', 'Anuj', 'Jai', 'Princi', 'Gaurav', 'Anuj', 'Princi', 'Abhi'],

'Age': [27, 24, 22, 32, 33, 36, 27, 32],

'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj', 'Jaunpur', 'Kanpur', 'Allahabad',


'Aligarh'],

'Qualification': ['Msc', 'MA', 'MCA', 'Phd', 'B.Tech', 'B.com', 'Msc', 'MA']}

# Convert the dictionary into DataFrame

df = pd.DataFrame(data1)
Page 16 of 30
Anish Sil
print("Original Dataframe")

display(df)

# applying groupby() function to group the data on Name value.

gk = df.groupby('Name')

# Let's print the first entries in all the groups formed.

print("After Creating Groups")

gk.first()

-------------------------------------------------------------------------------------

-------------------------------------------------------------------------------------
**Python Pandas Aggregation**
Page 17 of 30
Anish Sil
# Define a dictionary containing employee data

data1 = {'Name': ['Jai', 'Anuj', 'Jai', 'Princi', 'Gaurav', 'Anuj', 'Princi', 'Abhi'],

'Age': [27, 24, 22, 32, 33, 36, 27, 32],

'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj', 'Jaunpur', 'Kanpur', 'Allahabad',


'Aligarh'],

'Qualification': ['Msc', 'MA', 'MCA', 'Phd', 'B.Tech', 'B.com', 'Msc', 'MA']}

# Convert the dictionary into DataFrame

df = pd.DataFrame(data1)

# performing aggregation using aggregate method

grp1 = df.groupby('Name')

grp1.aggregate(np.sum)

-------------------------------------------------------------------------------------

-------------------------------------------------------------------------------------
**Concatenating DataFrame**

# Define a dictionary containing employee data

data1 = {'key': ['K0', 'K1', 'K2', 'K3'],

'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'],

'Age':[27, 24, 22, 32],}

# Define a dictionary containing employee data

data2 = {'key': ['K0', 'K1', 'K2', 'K3'],


Page 18 of 30
Anish Sil
'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],

'Qualification':['Btech', 'B.A', 'Bcom', 'B.hons']}

# Convert the dictionary into DataFrame

df = pd.DataFrame(data1)

# Convert the dictionary into DataFrame

df1 = pd.DataFrame(data2)

display(df, df1)

# combining series and dataframe

res = pd.concat([df, df1], axis=1)

res

-------------------------------------------------------------------------------------

Page 19 of 30
Anish Sil
-------------------------------------------------------------------------------------
**Merging DataFrame**

# Define a dictionary containing employee data

data1 = {'key': ['K0', 'K1', 'K2', 'K3'],

'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'],

'Age':[27, 24, 22, 32],}

# Define a dictionary containing employee data

data2 = {'key': ['K0', 'K1', 'K2', 'K3'],

'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],

'Qualification':['Btech', 'B.A', 'Bcom', 'B.hons']}

# Convert the dictionary into DataFrame

df = pd.DataFrame(data1)

# Convert the dictionary into DataFrame

df1 = pd.DataFrame(data2)

display(df, df1)

# using .merge() function

res = pd.merge(df, df1, on='key')

res

-------------------------------------------------------------------------------------

Page 20 of 30
Anish Sil

------------------------------------------------------------------------------------
**Visualization with Matplotlib**

# Python program to show pyplot module

import matplotlib.pyplot as plt

plt.plot([1, 2, 3, 4], [1, 4, 9, 16])

plt.axis([0, 6, 0, 20])

plt.show()

-------------------------------------------------------------------------------------

Page 21 of 30
Anish Sil

-------------------------------------------------------------------------------------
import matplotlib.pyplot as plt

import pandas as pd

df = pd.read_csv("iris_csv.csv")

# This will plot a simple bar chart

plt.bar(df['class'], df['sepallength'])

# Title to the plot

plt.title("Iris Dataset")

# Adding the legends

plt.legend(["bar"])

plt.show()

-------------------------------------------------------------------------------------

Page 22 of 30
Anish Sil

-------------------------------------------------------------------------------------
import matplotlib.pyplot as plt

import pandas as pd

df = pd.read_csv("iris_csv.csv")

plt.hist(df["sepallength"])

# Title to the plot

plt.title("Histogram")

# Adding the legends

plt.legend(["sepallength"])

plt.show()

-------------------------------------------------------------------------------------

Page 23 of 30
Anish Sil

-------------------------------------------------------------------------------------
plt.scatter(df["class"], df["sepallength"])

# Title to the plot

plt.title("Scatter Plot")

# Adding the legends

plt.legend(["sepallength"])

plt.show()

-------------------------------------------------------------------------------------

Page 24 of 30
Anish Sil

-------------------------------------------------------------------------------------
plt.boxplot(df["sepallength"])

# Title to the plot

plt.title("Box Plot")

# Adding the legends

plt.legend(["sepallength"])

plt.show()

-------------------------------------------------------------------------------------

Page 25 of 30
Anish Sil
-------------------------------------------------------------------------------------

**Some other important handy functions**

df.shape

df.info()

# importing packages

import seaborn as sns

import matplotlib.pyplot as plt

sns.scatterplot(x='sepallength', y='sepalwidth',hue='class', data=df,)

# Placing Legend outside the Figure

plt.legend(bbox_to_anchor=(1, 1), loc=2)

plt.show()

df.isnull().sum()

data = df.drop_duplicates(subset ="class",)

data

df.value_counts("class")

-------------------------------------------------------------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 sepallength 150 non-null float64
1 sepalwidth 150 non-null float64
2 petallength 150 non-null float64
3 petalwidth 150 non-null float64
4 class 150 non-null object
dtypes: float64(4), object(1)
memory usage: 6.0+ KB

Page 26 of 30
Anish Sil

class
Iris-setosa 50
Iris-versicolor 50
Iris-virginica 50
Name: count, dtype: int64

-------------------------------------------------------------------------------------
**Comparing Petal Length and Petal Width**

sns.scatterplot(x='petallength', y='petalwidth',hue='class', data=df,)

# Placing Legend outside the Figure

plt.legend(bbox_to_anchor=(1, 1), loc=2)

plt.show()

-------------------------------------------------------------------------------------

Page 27 of 30
Anish Sil
-------------------------------------------------------------------------------------
sns.pairplot(df.drop([], axis = 1), hue='class', height=2)

**Handling Outliers**

sns.boxplot(x='sepalwidth', data=df)

-------------------------------------------------------------------------------------

-------------------------------------------------------------------------------------
**Removing Outliers**

# Importing

import sklearn

# IQR

Q1 = np.percentile(df['sepalwidth'], 25,

interpolation = 'midpoint')

Q3 = np.percentile(df['sepalwidth'], 75,

interpolation = 'midpoint')

IQR = Q3 - Q1

Page 28 of 30
Anish Sil
print("Old Shape: ", df.shape)

# Upper bound

upper = np.where(df['sepalwidth'] >= (Q3+1.5*IQR))

# Lower bound

lower = np.where(df['sepalwidth'] <= (Q1-1.5*IQR))

-------------------------------------------------------------------------------------

-------------------------------------------------------------------------------------
# Removing the Outliers

df.drop(upper[0], inplace = True)

df.drop(lower[0], inplace = True)

print("New Shape: ", df.shape)

sns.boxplot(x='sepalwidth', data=df)

-------------------------------------------------------------------------------------

Page 29 of 30
Anish Sil

-------------------------------------------------------------------------------------

Page 30 of 30

You might also like