DSC Lab Programs

The document contains various Python programs demonstrating the use of NumPy and Pandas for data manipulation and analysis. It includes examples of array operations, statistical functions, creating and manipulating DataFrames, handling missing values, and visualizing data using plots. The content is structured as a series of code snippets, each illustrating specific functionalities and methods within the NumPy and Pandas libraries.

2. Working with NumPy arrays

# pgm 1
import numpy as np

matrix1 = [
[3, 4, 2],
[5, 1, 8],
[3, 1, 9]
]

matrix2 = [
[3, 7, 5],
[2, 9, 8],
[1, 5, 8]
]

result = np.dot(matrix1, matrix2)


print(result)
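
For 2-D inputs, np.dot performs matrix multiplication, so the same product can also be written with the @ operator (NumPy 1.10+), provided the operands are ndarrays rather than plain lists; a small check using the matrices above:

# The @ operator gives the same matrix product as np.dot for 2-D arrays
a = np.array(matrix1)
b = np.array(matrix2)
print(a @ b)
print(np.array_equal(a @ b, result))   # expected: True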

# pgm 2
import numpy as np

n = 8

# Create an n x n matrix filled with 0s


matrix = np.zeros((n, n), dtype=int)

# fill alternate rows and columns with 1 to form the checkerboard


matrix[::2, 1::2] = 1
matrix[1::2, ::2] = 1

# Print the checkerboard pattern


for i in range(n):
    for j in range(n):
        print(matrix[i][j], end=" ")
    print()

# pgm 3
import numpy as np

# Creating 5x4 array


array = np.arange(20).reshape(5, 4)
print(array)
print()

# If no axis is mentioned, it works on the entire (flattened) array


print(np.argmax(array))

# If axis=1, then it works on each row


print(np.argmax(array, axis=1))

# If axis=0, then it works on each column


print(np.argmax(array, axis=0))
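
When no axis is given, np.argmax returns an index into the flattened array; np.unravel_index converts it back to (row, column) form. A short addition using the same 5x4 array:

# Recover the 2-D position of the maximum from the flat index
flat_index = np.argmax(array)
print(np.unravel_index(flat_index, array.shape))   # (4, 3) for np.arange(20).reshape(5, 4)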

# pgm 4
import numpy as np
array = np.array([
[3, 7, 1],
[10, 3, 2],
[5, 6, 7]
])
print(array)
print()

# Sort the whole array


print(np.sort(array, axis=None))

# Sort along each row


print(np.sort(array, axis=1))

# Sort along each column


print(np.sort(array, axis=0))

# pgm 5
import numpy as np

array = np.array([28, 13, 45, 12, 4, 8, 0])


print(array)

print(np.argsort(array))

reversedArray = np.flipud(array)
print(reversedArray)

# pgm 6
import numpy as np

list = [
np.array([3, 2, 8, 9]),
np.array([4, 12, 34, 25, 78]),
np.array([23, 12, 67])
]

result = []
for i in range(len(list)):
    result.append(np.mean(list[i]))
print(result)

# pgm 7
import numpy as np

the_array = np.array([])
is_empty = the_array.size == 0
print(is_empty)

the_array = np.array([1, 2, 3])


is_empty = the_array.size == 0
print(is_empty)

# pgm 8
import numpy as np

array1 = np.array([[1, 2, 3], [4, 5, 6]])


array2 = np.array([[7, 8, 9], [10, 11, 12]])
print(array1 + array2)
print("-" * 20)

print(array1 - array2)
print("-" * 20)

print(array1 * array2)
print("-" * 20)

print(array2 / array1)
print("-" * 40)

print(array1 ** array2)
print("-" * 40)

# pgm 9
#NumPy Aggregate and Statistical Functions
import numpy as np

array1 = np.array([[10, 20, 30], [40, 50, 60]])

print("Mean: ", np.mean(array1))

print("Std: ", np.std(array1))

print("Var: ", np.var(array1))

print("Sum: ", np.sum(array1))

print("Prod: ", np.prod(array1))

# pgm 10
# How to print a NumPy array without scientific notation in Python? Suppress scientific notation
import numpy as np

np.set_printoptions(suppress=True,formatter={'float_kind': '{:f}'.format})

the_array = np.array([3.74, 5162, 13683628846.64, 12783387559.86, 1.81])


print(the_array)
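
np.set_printoptions changes the print format globally for the rest of the session; if the change should be temporary, the same options can be applied inside a with-block via np.printoptions (NumPy 1.15+). A minimal sketch using the same array:

# Formatting applies only inside the context manager
with np.printoptions(suppress=True, formatter={'float_kind': '{:f}'.format}):
    print(the_array)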

# pgm 11
#Sum of all elements
import numpy as np

arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])

newarr = arr.reshape(4, 3)
total_sum = newarr[:, :].sum()
print(total_sum)
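
The call above sums every element of the array; per-column and per-row totals come from the axis argument, as in this short sketch using the same newarr:

# axis=0 sums down each column, axis=1 sums across each row
print(newarr.sum(axis=0))   # 3 column totals
print(newarr.sum(axis=1))   # 4 row totals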

# pgm 12
# How do you find the mean across a column in Python? Calculate mean values across a column
import numpy as np

the_array = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])


mean_array = the_array.mean(axis=0)
print(mean_array)

mean_array = the_array.mean(axis=1)
print(mean_array)

mean_array = the_array[:, 0].mean()


print(mean_array)

# pgm 13
# How do you convert a one-dimensional array to a two-dimensional array in Python? 4 rows with 2 cols
import numpy as np

arr = np.array([1, 2, 3, 4, 5, 6, 7, 8])

newarr = arr.reshape(4, 2)
print(newarr)

newarr = arr.reshape(2, 4)
print(newarr)

newarr = np.reshape(arr, (-1, 2))


print(newarr)

newarr = np.reshape(arr, (1, arr.size))


print(newarr)

newarr = np.reshape(arr, (-1, 4))


print(newarr)

# pgm 14 maclaurin series

from math import e, factorial

import numpy as np

fac = np.vectorize(factorial)

def e_x(x, terms=10):
    """Approximates e^x using a given number of terms of
    the Maclaurin series
    """
    n = np.arange(terms)
    return np.sum((x ** n) / fac(n))

if __name__ == "__main__":
    print("Actual:", e ** 3)  # Using e from the standard library

    print("N (terms)\tMaclaurin\tError")

    for n in range(1, 14):
        maclaurin = e_x(3, terms=n)
        print(f"{n}\t\t{maclaurin:.03f}\t\t{e ** 3 - maclaurin:.03f}")

# pgm 15

#How to multiply each element of Numpy array in Python?


import numpy as np

the_array = np.array([[1, 2, 3], [1, 2, 3]])

prod = np.prod(the_array)
print(prod)

prod = np.prod(the_array, 0)
print(prod)

prod = np.prod(the_array, 1)
print(prod)

the_array = np.array([1, 2, 3])

prod = np.prod(the_array)
print(prod)

3. Working with Pandas data frames

# pgm 1
# import pandas as pd
import pandas as pd

# list of strings
lst = ['Good', 'Morning', 'Have', 'a',
'nice', 'day', 'Welcome']

# Calling DataFrame constructor on list


df = pd.DataFrame(lst)
print(df)

# pgm 2
# Python code to demonstrate creating a
# DataFrame from a dict of ndarrays / lists.
# By default, the rows are indexed 0, 1, 2, ...

import pandas as pd

# initialise data of lists.


data = {'Name': ['Tom', 'nick', 'krish', 'jack'],
'Age': [20, 21, 19, 18]}

# Create DataFrame
df = pd.DataFrame(data)

# Print the output.


print(df)

# pgm 3
# Import pandas package
import pandas as pd

# Define a dictionary containing employee data


data = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
'Age': [27, 24, 22, 32],
'Address': ['Delhi', 'Kanpur', 'Allahabad', 'Kannauj'],
'Qualification': ['Msc', 'MA', 'MCA', 'Phd']}

# Convert the dictionary into DataFrame


df = pd.DataFrame(data)

# select two columns


print(df[['Name', 'Qualification']])

# pgm 4

# importing pandas package


import pandas as pd

# making data frame from csv file


data = pd.read_csv("D:/edith esther/CS3361 DSC LAB/nba.csv", index_col="Name")

# retrieving row by loc method


first = data.loc["Avery Bradley"]
second = data.loc["R.J. Hunter"]

print(first, "\n\n\n", second)

# pgm 5

# importing pandas package


import pandas as pd

# making data frame from csv file


data = pd.read_csv("D:/edith esther/CS3361 DSC LAB/nba.csv", index_col="Name")

# retrieving columns by indexing operator


first = data["Age"]

print(first)

# pgm 6

# importing pandas package


import pandas as pd

# making data frame from csv file


data = pd.read_csv("D:/edith esther/CS3361 DSC LAB/nba.csv", index_col="Name")

# retrieving row by loc method


first = data.loc["Avery Bradley"]
second = data.loc["R.J. Hunter"]

print(first, "\n\n\n", second)

# pgm 7
import pandas as pd

# making data frame from csv file


data = pd.read_csv("D:/edith esther/CS3361 DSC LAB/nba.csv", index_col="Name")

# retrieving rows by iloc method


row2 = data.iloc[3]
print(row2)

# pgm 8
# Using isnull() in order to find null values in a DataFrame.
# importing pandas as pd
import pandas as pd

# importing numpy as np
import numpy as np

# dictionary of lists
dict = {'First Score': [100, 90, np.nan, 95],
'Second Score': [30, 45, 56, np.nan],
'Third Score': [np.nan, 40, 80, 98]}

# creating a dataframe from list


df = pd.DataFrame(dict)

# using isnull() function


print(df.isnull())

# pgm 9

# importing pandas as pd
import pandas as pd

# importing numpy as np
import numpy as np

# dictionary of lists
dict = {'First Score': [100, 90, np.nan, 95],
'Second Score': [30, 45, 56, np.nan],
'Third Score': [np.nan, 40, 80, 98]}

# creating a dataframe from dictionary


df = pd.DataFrame(dict)

# filling missing value using fillna()


print(df.fillna(0))

# pgm 10

# Now we drop rows with at least one Nan value (Null value)
# importing pandas as pd
import pandas as pd

# importing numpy as np
import numpy as np

# dictionary of lists
dict = {'First Score': [100, 90, np.nan, 95],
'Second Score': [30, np.nan, 45, 56],
'Third Score': [52, 40, 80, 98],
'Fourth Score': [np.nan, np.nan, np.nan, 65]}

# creating a dataframe from dictionary


df = pd.DataFrame(dict)
# using dropna() function
print(df.dropna())

# pgm 11

# In order to iterate over rows, we can use three functions: iteritems() (renamed items() in current pandas), iterrows() and itertuples().
# These three functions will help in iterating over rows; iterrows() is used in pgm 12 and the other two are sketched just after it.

# importing pandas as pd
import pandas as pd

# dictionary of lists
dict = {'name': ["aparna", "pankaj", "sudhir", "Geeku"],
'degree': ["MBA", "BCA", "M.Tech", "MBA"],
'score': [90, 40, 80, 98]}

# creating a dataframe from a dictionary


df = pd.DataFrame(dict)

print(df)

# pgm 12
# Now we apply the iterrows() function in order to get each element of the rows.

# importing pandas as pd
import pandas as pd

# dictionary of lists
dict = {'name': ["aparna", "pankaj", "sudhir", "Geeku"],
'degree': ["MBA", "BCA", "M.Tech", "MBA"],
'score': [90, 40, 80, 98]}

# creating a dataframe from a dictionary


df = pd.DataFrame(dict)

# iterating over rows using iterrows() function


for i, j in df.iterrows():
    print(i, j)
    print()
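
The note before pgm 11 also mentions itertuples() and iteritems() (items() in current pandas); a minimal sketch of both, using the same df:

# itertuples() yields one named tuple per row
for row in df.itertuples():
    print(row.Index, row.name, row.degree, row.score)
print()

# items() yields (column label, Series) pairs
for label, column in df.items():
    print(label)
    print(column)
    print()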

# pgm 13

# In order to iterate over columns, we need to create a list of the DataFrame columns
# and then iterate through that list to pull out each column (see the sketch after this program).

# importing pandas as pd
import pandas as pd

# dictionary of lists
dict = {'name': ["aparna", "pankaj", "sudhir", "Geeku"],
'degree': ["MBA", "BCA", "M.Tech", "MBA"],
'score': [90, 40, 80, 98]}

# creating a dataframe from a dictionary


df = pd.DataFrame(dict)

print(df)
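
The program above stops after printing the frame; a minimal sketch of the column iteration described in the comment, using the same df:

# Build a list of the DataFrame columns, then pull each one out by name
columns = list(df)
for col in columns:
    print(df[col])
    print()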
4 Reading data from text files, Excel and the web and exploring various
commands for doing descriptive analytics on the Iris data set

print("Iris dataset example project")


print("Importing pandas to use in our code as pd.")
import pandas as pd

print('Reading the dataset “Iris.csv”')


data = pd.read_csv("D:/CSE/dataset/iris.csv")

print("Displaying up the top rows of the dataset with their columns")


print(data.head())

print("Displaying the number of rows randomly")


print(data.sample(10))

print("Displaying the number of columns and names of the columns.")


print(data.columns)

print("Displaying the shape of the dataset")


print(data.shape)

print("Slicing the rows")


print(data[10:21])
sliced_data = data[10:21]
print(sliced_data)

print("Displaying only specific columns.")


specific_data = data[["Id", "Species"]]
print(specific_data.head(10))

print("Displaying the specific rows using “iloc” and “loc” functions")


print(data.iloc[5])
print(data.loc[data["Species"] == "Iris-setosa"])

print("Counting the number of counts of unique values using 'value_counts()'")


print(data["Species"].value_counts())

print("Calculating sum, mean and mode of a particular column.")


sum_data = data["SepalLengthCm"].sum()
mean_data = data["SepalLengthCm"].mean()
median_data = data["SepalLengthCm"].median()
print("Sum:", sum_data, "\nMean:", mean_data, "\nMedian:", median_data)

print("Extracting minimum and maximum from a column")


min_data = data["SepalLengthCm"].min()
max_data = data["SepalLengthCm"].max()
print("Minimum:", min_data, "\nMaximum:", max_data)

print("Adding a column to the dataset.")


cols = data.columns
print(cols)
data1 = data[cols]
data["total_values"]=data1[cols].sum(axis=1,numeric_only = True)
print("Cleaning and detecting missing values")
print(data.isnull())

print("Pandas Dataframe Correlation")


print(data.corr(method='pearson',numeric_only = True))
print(data.corr(numeric_only = True))

print("describe")
print(data.describe())

print("create another column in the data frame with number for different species")
Target = []
for i in range(len(data['Species'])):
if data['Species'][i] == "Iris-setosa":
Target.append("1")
elif data['Species'][i] == 'Iris-versicolor':
Target.append("2")
else:
Target.append('3')
data['Target'] = Target
print(data.to_string)
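
The same Target column can be produced without an explicit loop; a minimal alternative using map() on the Species column defined above:

# map() replaces each species label with its code in one step
data['Target'] = data['Species'].map({'Iris-setosa': '1', 'Iris-versicolor': '2', 'Iris-virginica': '3'})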

print(" Group the dataset based on species ")


print(data.groupby('Species').size())

import matplotlib.pyplot as plt

print("draw an area plot for only one variable in this dataset")


data['PetalLengthCm'].plot.area()
plt.show()

print("draw the area plots of all columns")


data.plot.area()
plt.show()

print("draw a scatterplot")
data.plot.scatter(x='PetalLengthCm', y='PetalWidthCm')
plt.show()

print(" plot the petal length and width using hexagonal binning ")
data.plot.hexbin(x="PetalLengthCm", y="PetalWidthCm", gridsize=25)
plt.show()

print(" plot the average of petal length ")


iris_avg=data["PetalLengthCm"].groupby(data["Species"]).mean()
print("petallength average")
print(iris_avg)
iris_avg.plot.pie()
plt.show()

print(" plot the average of petal length and width ")


iris_avg_2=data[["PetalWidthCm","PetalLengthCm"]].groupby(data["Species"]).mean()
print("average of petal length and width")
print(iris_avg_2)
iris_avg_2.plot.pie(subplots=True)
plt.show()

print(" Plot scatterplots for all three species ")


setosa = data[data.Species == "Iris-setosa"]
versicolor = data[data.Species=='Iris-versicolor']
virginica = data[data.Species=='Iris-virginica']
fig, ax = plt.subplots()
fig.set_size_inches(13, 7) # adjusting the length and width of plot

print("lables and scatter points")


ax.scatter(setosa['PetalLengthCm'], setosa['PetalWidthCm'], label="Iris-Setosa", facecolor="blue")
ax.scatter(versicolor['PetalLengthCm'], versicolor['PetalWidthCm'], label="Iris-Versicolor", facecolor="green")
ax.scatter(virginica['PetalLengthCm'], virginica['PetalWidthCm'], label="Iris-Virginica", facecolor="red")
ax.set_xlabel("PetalLengthCm")
ax.set_ylabel("PetalWidthCm")
ax.grid()
ax.set_title("Iris petals")
ax.legend()
plt.show()

5 Use the diabetes data set from UCI and Pima Indians Diabetes data set for
performing the following:
a. Univariate analysis: Frequency, Mean, Median, Mode, Variance, Standard
Deviation, Skewness and Kurtosis.
b. Bivariate analysis: Linear and logistic regression modelling
c. Multiple Regression analysis
d. Also compare the results of the above analysis for the two data sets

i. PIMA INDIAN DIABETES dataset

print("PIMA diabetes dataset")


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Reading the dataset


diab=pd.read_csv("D:/CSE/dataset/pimadataset.csv")

print("printing top rows")


print(diab.head())

print("Displaying the number of rows randomly")


print(diab.sample(10))

print("Displaying the number of columns and names of the columns.")


print(diab.columns)

print("Displaying the shape of the dataset")


print(diab.shape)

print("Finding if there are any null and Zero values in the data set")
print(diab.isnull().values.any())

print("To run numerical descriptive stats for the data set")


print(diab.describe())
print("find out how many Zero values are included in each variable")
print((diab.Pregnancies == 0).sum(),(diab.Glucose==0).sum(),
(diab.BloodPressure==0).sum(),
(diab.SkinThickness==0).sum(),(diab.Insulin==0).sum(),(diab.BMI==0).sum(),
(diab.DiabetesPedigreeFunction==0).sum(),(diab.Age==0).sum())

print("we’ll drop 0 values and create a our new dataset which can be used for
further analysis")
'''Creating a dataset called 'dia' from original dataset 'diab' with excludes all
rows with have
zeros only for Glucose,BP, Skinthickness, Insulin and BMI, as other columns can
contain
Zero values'''
drop_Glu=diab.index[diab.Glucose == 0].tolist()
drop_BP=diab.index[diab.BloodPressure == 0].tolist()
drop_Skin = diab.index[diab.SkinThickness==0].tolist()
drop_Ins = diab.index[diab.Insulin==0].tolist()
drop_BMI = diab.index[diab.BMI==0].tolist()
c=drop_Glu+drop_BP+drop_Skin+drop_Ins+drop_BMI
dia=diab.drop(diab.index[c])
print(dia.info())
print(dia.describe())

dia1 = dia[dia.Outcome==1]
dia0 = dia[dia.Outcome==0]
print(dia0)

# Computing the percentage of diabetic and non-diabetic cases in the sample


Out1=len(dia[dia.Outcome==1])
Out0=len(dia[dia.Outcome==0])
Total=Out0+Out1
PC_of_1 = Out1*100/Total
PC_of_0 = Out0*100/Total
print(PC_of_1, PC_of_0)

cor = dia.corr(method ='pearson')


print(cor)

print("Counting the number of counts of unique values using 'value_counts()'")


print(diab["Outcome"].value_counts())

print("Calculating sum, mean and mode of a particular column.")


sum_data = diab["BMI"].sum()
mean_data = diab["BMI"].mean()
median_data = diab["BMI"].median()
mode_data = diab["BMI"].mode()
print("Sum:", sum_data, "\nMean:", mean_data, "\nMedian:", median_data,"\
nMode:",mode_data)

print("Calculating variance and standard deviation")


var_data = diab["BMI"].var()
std_data = diab["BMI"].std()
print("Variance:", var_data, "\nStandard Deviation:", std_data)

print("Extracting minimum and maximum from a column")


min_data = diab["BMI"].min()
max_data = diab["BMI"].max()
print("Minimum:", min_data, "\nMaximum:", max_data)

print("Cleaning and detecting missing values")


print(diab.isnull())
print("create another column in the data frame with number for different species")
Target = []
for i in range(len(diab['Outcome'])):
if diab['Outcome'][i] == 0:
Target.append("Negative")
else:
Target.append('Positive')
diab['Target'] = Target
print(diab.to_string)

print(" Group the dataset based on species ")


print(diab.groupby('Target').size())

# skip the na values


print("find skewness in each row")
print("Use skew() function to find the skewness of the data over the column axis.")
skew1=diab.skew(axis = 1, skipna = True,numeric_only = True)
print("\nSkew value in axis =1:",skew1)

print("skewness along the index axis")


skew0=diab.skew(axis = 0, skipna = True,numeric_only = True)
print("\nSkew value in axis =0:",skew0)

print("\nkurtosis value in axis =0")


print("Pearson's Kurtosis and Fisher's Kurtosis")
kurt0=diab.kurt(axis=0,numeric_only = True)
print(kurt0)

print("\nkurtosis value in axis =1")


print("Pearson's Kurtosis and Fisher's Kurtosis")
kurt1=diab.kurt(axis=1,numeric_only = True)
print(kurt1)

print("calculate Regression")
import numpy as np
import matplotlib.pyplot as plt

def estimate_coef(x, y):
    # number of observations/points
    n = np.size(x)

    # mean of x and y vector
    m_x = np.mean(x)
    m_y = np.mean(y)

    # calculating cross-deviation and deviation about x
    SS_xy = np.sum(y * x) - n * m_y * m_x
    SS_xx = np.sum(x * x) - n * m_x * m_x

    # calculating regression coefficients
    b_1 = SS_xy / SS_xx
    b_0 = m_y - b_1 * m_x
    return (b_0, b_1)

def plot_regression_line(x, y, b):
    # plotting the actual points as scatter plot
    plt.scatter(x, y, color="m", marker="o", s=30)

    # predicted response vector
    y_pred = b[0] + b[1] * x

    # plotting the regression line
    plt.plot(x, y_pred, color="g")

    # putting labels
    plt.xlabel('x')
    plt.ylabel('y')

    # function to show plot
    plt.show()

def main():
    # observations / data
    x = diab["Glucose"]
    y = diab["Age"]

    # estimating coefficients
    b = estimate_coef(x, y)
    print("Estimated coefficients:\nb_0 = {}\nb_1 = {}".format(b[0], b[1]))

    # plotting regression line
    plot_regression_line(x, y, b)

if __name__ == "__main__":
    main()
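
Items (b) and (c) of this exercise also ask for logistic regression and multiple regression, which the listing above does not cover; a minimal sketch on the cleaned 'dia' frame, assuming scikit-learn is installed and taking all predictor columns as features (the choice of BMI as the multiple-regression response is only illustrative):

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split

# Logistic regression: model the binary Outcome from all other columns
X = dia.drop(columns=['Outcome'])
y = dia['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
print("Logistic regression accuracy:", log_model.score(X_test, y_test))

# Multiple linear regression: model BMI from the remaining predictors
X2 = dia.drop(columns=['Outcome', 'BMI'])
y2 = dia['BMI']
lin_model = LinearRegression()
lin_model.fit(X2, y2)
print("Multiple regression R^2:", lin_model.score(X2, y2))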

ii. UC Irvine Diabetes Dataset

print("UC Irwin diabetes dataset")


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Reading the dataset


diab=pd.read_csv("D:/edith esther/CS3361 DSC LAB/uci_diabetes_use.csv")

print("printing original table")


print(diab.to_string())

# Create copy of pandas DataFrame


data1_bool = diab.copy()
# Replace string by boolean
data1_bool['Gender'] = data1_bool['Gender'].map({'Male': 1, 'Female': 0})
data1_bool['Polyuria'] = data1_bool['Polyuria'].map({'Yes': 1, 'No': 0})
data1_bool['Polydipsia'] = data1_bool['Polydipsia'].map({'Yes': 1, 'No': 0})
data1_bool['sudden weight loss'] = data1_bool['sudden weight loss'].map({'Yes': 1,
'No': 0})
data1_bool['weakness'] = data1_bool['weakness'].map({'Yes': 1, 'No': 0})
data1_bool['Polyphagia'] = data1_bool['Polyphagia'].map({'Yes': 1, 'No': 0})
data1_bool['Genital thrush'] = data1_bool['Genital thrush'].map({'Yes': 1, 'No':
0})
data1_bool['visual blurring'] = data1_bool['visual blurring'].map({'Yes': 1, 'No':
0})
data1_bool['Itching'] = data1_bool['Itching'].map({'Yes': 1, 'No': 0})
data1_bool['Irritability'] = data1_bool['Irritability'].map({'Yes': 1, 'No': 0})
data1_bool['delayed healing'] = data1_bool['delayed healing'].map({'Yes': 1, 'No':
0})
data1_bool['partial paresis'] = data1_bool['partial paresis'].map({'Yes': 1, 'No':
0})
data1_bool['muscle stiffness'] = data1_bool['muscle stiffness'].map({'Yes': 1,
'No': 0})
data1_bool['Alopecia'] = data1_bool['Alopecia'].map({'Yes': 1, 'No': 0})
data1_bool['Obesity'] = data1_bool['Obesity'].map({'Yes': 1, 'No': 0})
data1_bool['class'] = data1_bool['class'].map({'Positive': 1, 'Negative': 0})
print(data1_bool.to_string())
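
The column-by-column mapping above can also be written as a single replace over the Yes/No columns; an equivalent form, shown only for reference (at this point the frame is already numeric), assuming the same data1_bool and its column names:

# Replace every 'Yes'/'No' in one pass; Age, Gender and class are handled separately above
yes_no_columns = data1_bool.columns.drop(['Age', 'Gender', 'class'])
data1_bool[yes_no_columns] = data1_bool[yes_no_columns].replace({'Yes': 1, 'No': 0})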

cor = data1_bool.corr(method ='pearson')


print(cor)

print("correlation between Age and class")


print(data1_bool['Age'].corr(data1_bool['class']))

print("Counting the number of counts of unique values using 'value_counts()'")


print(data1_bool["class"].value_counts())

# skip the na values


print("find skewness in each row")
print("Use skew() function to find the skewness of the data over the column axis.")
skew1=data1_bool.skew(axis = 1, skipna = True)
print("\nSkew value in axis =1:",skew1)

print("skewness along the index axis")


skew0=data1_bool.skew(axis = 0, skipna = True)
print("\nSkew value in axis =0:",skew0)

print("\nkurtosis value in axis =0")


print("Pearson's Kurtosis and Fisher's Kurtosis")
kurt0=data1_bool.kurt(axis=0)
print(kurt0)

print("\nkurtosis value in axis =1")


print("Pearson's Kurtosis and Fisher's Kurtosis")
kurt1=data1_bool.kurt(axis=1)
print(kurt1)

print("calculate Regression")
import numpy as np
import matplotlib.pyplot as plt
def estimate_coef(x, y):
    # number of observations/points
    n = np.size(x)

    # mean of x and y vector
    m_x = np.mean(x)
    m_y = np.mean(y)

    # calculating cross-deviation and deviation about x
    SS_xy = np.sum(y * x) - n * m_y * m_x
    SS_xx = np.sum(x * x) - n * m_x * m_x

    # calculating regression coefficients
    b_1 = SS_xy / SS_xx
    b_0 = m_y - b_1 * m_x

    return (b_0, b_1)

def plot_regression_line(x, y, b):
    # plotting the actual points as scatter plot
    plt.scatter(x, y, color="m", marker="o", s=30)

    # predicted response vector
    y_pred = b[0] + b[1] * x

    # plotting the regression line
    plt.plot(x, y_pred, color="g")

    # putting labels
    plt.xlabel('x')
    plt.ylabel('y')

    # function to show plot
    plt.show()

def main():
    # observations / data
    x = data1_bool["Obesity"]
    y = data1_bool["Age"]

    # estimating coefficients
    b = estimate_coef(x, y)
    print("Estimated coefficients:\nb_0 = {}\nb_1 = {}".format(b[0], b[1]))

    # plotting regression line
    plot_regression_line(x, y, b)

if __name__ == "__main__":
    main()

6 Apply and explore various plotting functions on UCI data sets.
a. Normal curves
b. Density and contour plots
c. Correlation and scatter plots
d. Histograms
e. Three-dimensional plotting

print("diabetes dataset")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Reading the dataset


diab=pd.read_csv("D:/edith esther/CS3361 DSC LAB/pimadataset.csv")

print("Finding if there are any null and Zero values in the data set")
print(diab.isnull().values.any())

print("find out how many Zero values are included in each variable")
print((diab.Pregnancies == 0).sum(),(diab.Glucose==0).sum(),
(diab.BloodPressure==0).sum(),(diab.SkinThickness==0).sum(),
(diab.Insulin==0).sum(),(diab.BMI==0).sum(),
(diab.DiabetesPedigreeFunction==0).sum(),(diab.Age==0).sum())

print("we’ll drop 0 values and create a our new dataset which can be used for
further analysis")
## Creating a dataset called 'dia' from original dataset 'diab' with excludes all
rows with have zeros only for Glucose, BP, Skinthickness, Insulin and BMI, as other
columns can contain Zero values.
drop_Glu=diab.index[diab.Glucose == 0].tolist()
drop_BP=diab.index[diab.BloodPressure == 0].tolist()
drop_Skin = diab.index[diab.SkinThickness==0].tolist()
drop_Ins = diab.index[diab.Insulin==0].tolist()
drop_BMI = diab.index[diab.BMI==0].tolist()
c=drop_Glu+drop_BP+drop_Skin+drop_Ins+drop_BMI
dia=diab.drop(diab.index[c])
print(dia.info())
print(dia.describe())

dia1 = dia[dia.Outcome==1]
dia0 = dia[dia.Outcome==0]
print(dia0)

# Computing the percentage of diabetic and non-diabetic cases in the sample


Out1=len(dia[dia.Outcome==1])
Out0=len(dia[dia.Outcome==0])
Total=Out0+Out1
PC_of_1 = Out1*100/Total
PC_of_0 = Out0*100/Total
print(PC_of_1, PC_of_0)

cor = dia.corr(method ='pearson')


print(cor)
print("Counting the number of counts of unique values using 'value_counts()'")
print(diab["Outcome"].value_counts())

print("Calculating sum, mean and mode of a particular column.")


sum_data = diab["BMI"].sum()
mean_data = diab["BMI"].mean()
median_data = diab["BMI"].median()
mode_data = diab["BMI"].mode()
print("Sum:", sum_data, "\nMean:", mean_data, "\nMedian:", median_data,"\
nMode:",mode_data)

print("Calculating variance and standard deviation")


var_data = diab["BMI"].var()
std_data = diab["BMI"].std()
print("Variance:", var_data, "\nStandard Deviation:", std_data)

print("Extracting minimum and maximum from a column")


min_data = diab["BMI"].min()
max_data = diab["BMI"].max()
print("Minimum:", min_data, "\nMaximum:", max_data)

print("Cleaning and detecting missing values")


print(diab.isnull())

print("create another column in the data frame with number for different species")
Target = []
for i in range(len(diab['Outcome'])):
if diab['Outcome'][i] == 0:
Target.append("Negative")
else:
Target.append('Positive')
diab['Target'] = Target
print(diab.to_string)

print(" Group the dataset based on species ")


print(diab.groupby('Target').size())

# skip the na values


print("find skewness in each row")
print("Use skew() function to find the skewness of the data over the column axis.")
skew1=diab.skew(axis = 1, skipna = True,numeric_only = True)
print("\nSkew value in axis =1:",skew1)

print("skewness along the index axis")


skew0=diab.skew(axis = 0, skipna = True,numeric_only = True)
print("\nSkew value in axis =0:",skew0)

print("\nkurtosis value in axis =0")


print("Pearson's Kurtosis and Fisher's Kurtosis")
kurt0=diab.kurt(axis=0,numeric_only = True)
print(kurt0)

print("\nkurtosis value in axis =1")


print("Pearson's Kurtosis and Fisher's Kurtosis")
kurt1=diab.kurt(axis=1,numeric_only = True)
print(kurt1)

# Import required package


import matplotlib.pyplot as plt

# set the figure size


plt.rcParams['figure.figsize'] = [20, 10];

# Draw histograms for all attributes


diab.hist()
plt.show()

# Draw box and whisker plots for all attributes


diab.plot(kind= 'box', subplots=True, layout=(3,3), sharex=False, sharey=False)
plt.show()

# Draw Histogram plots for all attributes


diab.plot(kind= 'hist', subplots=True, layout=(3,3), sharex=False, sharey=False)
plt.show()

# Density plots for all attributes


#diab.plot(kind= 'density', subplots=True, layout=(3,3), sharex=False)
#plt.show()

print("Multivariate Plots")
# Compute the correlation matrix
correlations = diab.corr(method = 'pearson',numeric_only = True)

# Correlations between all pairs of attributes


# import required package
import numpy as np

# plot correlation matrix


fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)
ticks = np.arange(0,9,1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
names = correlations.columns

# Rotate x-tick labels by 90 degrees


ax.set_xticklabels(names,rotation=90)
ax.set_yticklabels(names)
plt.show()

# Import required package


from pandas.plotting import scatter_matrix
plt.rcParams['figure.figsize'] = [20, 20]
# Plotting Scatterplot Matrix
scatter_matrix(diab)
plt.show()
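
Items (a), (b) and (e) of this exercise (normal curves, density and contour plots, and three-dimensional plotting) are not covered by the listing above; a minimal sketch of each on the cleaned 'dia' frame, assuming SciPy is available for the normal curve and density plots:

from scipy.stats import norm
from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection on older matplotlib

# (a) Normal curve fitted over the Glucose histogram
glucose = dia['Glucose']
x = np.linspace(glucose.min(), glucose.max(), 200)
plt.hist(glucose, bins=20, density=True, alpha=0.5)
plt.plot(x, norm.pdf(x, glucose.mean(), glucose.std()), color='r')
plt.title("Normal curve over Glucose")
plt.show()

# (b) Density plots (the commented-out block above) and a contour of Glucose vs BMI
dia.plot(kind='density', subplots=True, layout=(3, 3), sharex=False)
plt.show()
counts, xedges, yedges = np.histogram2d(dia['Glucose'], dia['BMI'], bins=20)
xcenters = (xedges[:-1] + xedges[1:]) / 2
ycenters = (yedges[:-1] + yedges[1:]) / 2
plt.contourf(xcenters, ycenters, counts.T)
plt.colorbar()
plt.xlabel("Glucose")
plt.ylabel("BMI")
plt.title("Contour of Glucose vs BMI")
plt.show()

# (e) Three-dimensional scatter of Glucose, BMI and Age
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(dia['Glucose'], dia['BMI'], dia['Age'])
ax.set_xlabel("Glucose")
ax.set_ylabel("BMI")
ax.set_zlabel("Age")
plt.show()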

7 Visualizing Geographic Data with Basemap

print("Visualizing Geographic Data with Basemap")


from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
from matplotlib import image
import matplotlib.image as mpimg
import numpy as np

# input desired coordinates


my_coords = [38.9719980,-76.9219820]

# How much to zoom from coordinates (in degrees)


zoom_scale = 1

# Setup the bounding box for the zoom and bounds of the map
bbox = [my_coords[0]-zoom_scale,my_coords[0]+zoom_scale,\
my_coords[1]-zoom_scale,my_coords[1]+zoom_scale]

plt.figure(figsize=(12,6))
# Define the projection, scale, the corners of the map, and the resolution.
m = Basemap(projection='merc',llcrnrlat=bbox[0],urcrnrlat=bbox[1],\
llcrnrlon=bbox[2],urcrnrlon=bbox[3],lat_ts=10,resolution='i')

# Draw coastlines and fill continents and water with color


m.drawcoastlines()
m.fillcontinents(color='peru',lake_color='dodgerblue')

# draw parallels, meridians, and color boundaries


m.drawparallels(np.arange(bbox[0],bbox[1],(bbox[1]-bbox[0])/5),labels=[1,0,0,0])
m.drawmeridians(np.arange(bbox[2],bbox[3],(bbox[3]-bbox[2])/
5),labels=[0,0,0,1],rotation=45)
m.drawmapboundary(fill_color='dodgerblue')

# build and plot coordinates onto map


x,y = m(my_coords[1],my_coords[0])
m.plot(x,y,marker='D',color='r')
plt.title("Geographic Point Test")
plt.savefig('coordinate_test.png', format='png', dpi=500)
plt.show()

fig = plt.figure(figsize=(8, 8))


m = Basemap(projection='lcc', resolution=None,width=8E6, height=8E6,lat_0=45,
lon_0=-100,)
m.etopo(scale=0.5, alpha=0.5)
# Map (long, lat) to (x, y) for plotting
x, y = m(-122.3, 47.6)
plt.plot(x, y, 'ok', markersize=5)
plt.text(x, y, ' Seattle', fontsize=12);
plt.show()

fig = plt.figure(figsize = (12,12))


m = Basemap()
m.drawcoastlines()
plt.title("Coastlines", fontsize=20)
plt.show()

fig = plt.figure(figsize = (12,12))


m = Basemap()
m.drawcoastlines(linewidth=1.0, linestyle='dashed', color='red')
plt.title("Coastlines", fontsize=20)
plt.show()
fig = plt.figure(figsize = (12,12))
m = Basemap()
m.drawcoastlines(linewidth=1.0, linestyle='solid', color='black')
m.drawcountries()
plt.title("Country boundaries", fontsize=20)
plt.show()

fig = plt.figure(figsize = (12,12))


m = Basemap()
m.drawcoastlines(linewidth=1.0, linestyle='solid', color='black')
m.drawcountries(linewidth=1.0, linestyle='solid', color='k')
plt.title("Country boundaries", fontsize=20)
plt.show()

fig = plt.figure(figsize = (12,12))


m = Basemap()
m.drawcoastlines(linewidth=1.0, linestyle='solid', color='black')
m.drawcountries(linewidth=1.0, linestyle='solid', color='k')
m.drawrivers(linewidth=0.5, linestyle='solid', color='#0000ff')
plt.title("Major rivers", fontsize=20)
plt.show()

fig = plt.figure(figsize = (12,12))


m = Basemap()
m.drawcoastlines(linewidth=1.0, linestyle='solid', color='black')
m.drawcountries(linewidth=1.0, linestyle='solid', color='k')
m.fillcontinents()
plt.title("Color filled continents", fontsize=20)
plt.show()

fig = plt.figure(figsize = (12,12))


m = Basemap()
m.drawcoastlines(linewidth=1.0, linestyle='solid', color='black')
m.drawcountries(linewidth=1.0, linestyle='solid', color='k')
m.fillcontinents(color='coral',lake_color='aqua', alpha=0.9)
plt.title("Color filled continents", fontsize=20)
plt.show()

fig = plt.figure(figsize = (12,12))


m = Basemap()
m.drawcoastlines(linewidth=1.0, linestyle='solid', color='black')
m.drawcountries(linewidth=1.0, linestyle='solid', color='k')
m.fillcontinents(color='coral',lake_color='aqua')
m.drawmapboundary(color='b', linewidth=2.0, fill_color='aqua')
plt.title("Filled map boundary", fontsize=20)
plt.show()

fig = plt.figure(figsize = (12,12))


m = Basemap()
m.drawcoastlines(linewidth=1.0, linestyle='solid', color='black')
m.drawcountries(linewidth=1.0, linestyle='solid', color='k')
m.fillcontinents(color='coral',lake_color='aqua')
m.drawmeridians(range(0, 360, 20), color='k', linewidth=1.0, dashes=[4, 4],
labels=[0, 0, 1, 1])
plt.title("Longitude lines", fontsize=20, pad=30)
plt.show()

fig = plt.figure(figsize = (12,12))


m = Basemap()
m.drawcoastlines(linewidth=1.0, linestyle='solid', color='black')
m.drawcountries(linewidth=1.0, linestyle='solid', color='k')
m.fillcontinents(color='coral',lake_color='aqua')
m.drawparallels(range(-90, 100, 10), color='k', linewidth=1.0, dashes=[4, 4],
labels=[1, 1, 0, 0])
plt.title("Latitude lines", fontsize=20)
plt.show()

fig = plt.figure(figsize = (12,12))


m = Basemap()
m.drawcoastlines(linewidth=1.0, linestyle='solid', color='black')
m.drawcountries(linewidth=1.0, linestyle='solid', color='k')
m.fillcontinents(color='coral',lake_color='aqua')
m.drawmeridians(range(0, 360, 20), color='k', linewidth=1.0, dashes=[4, 4],
labels=[0, 0, 0, 1])
m.drawparallels(range(-90, 100, 10), color='k', linewidth=1.0, dashes=[4, 4],
labels=[1, 0, 0, 0])
plt.ylabel("Latitude", fontsize=15, labelpad=35)
plt.xlabel("Longitude", fontsize=15, labelpad=20)
plt.show()

fig = plt.figure(figsize = (10,8))


m = Basemap(projection='merc',llcrnrlat=-80,urcrnrlat=80,llcrnrlon=-180,urcrnrlon=180)
m.drawcoastlines()
m.fillcontinents(color='tan',lake_color='lightblue')
m.drawcountries(linewidth=1, linestyle='solid', color='k' )
m.drawmapboundary(fill_color='lightblue')
plt.title("Mercator Projection", fontsize=20)
plt.show()

fig = plt.figure(figsize = (10,8))


m = Basemap(projection='cyl',llcrnrlat=-80,urcrnrlat=80,llcrnrlon=-180,urcrnrlon=180)
m.drawcoastlines()
m.fillcontinents(color='tan',lake_color='lightblue')
m.drawcountries(linewidth=1, linestyle='solid', color='k' )
m.drawmapboundary(fill_color='lightblue')
plt.title(" Cylindrical Equidistant Projection", fontsize=20)
plt.show()

#empty circle
plt.figure(figsize=(8, 8))
m = Basemap(projection='ortho', resolution=None, lat_0=50, lon_0=-100)
m.bluemarble(scale=0.5);
plt.title("Empty circle", fontsize=18)
plt.show()

fig = plt.figure(figsize = (10,8))


m = Basemap(projection='ortho', lon_0 = 25, lat_0 = 10)
m.drawcoastlines()
m.fillcontinents(color='tan',lake_color='lightblue')
m.drawcountries(linewidth=1, linestyle='solid', color='k' )
m.drawmapboundary(fill_color='lightblue')
plt.title("Orthographic Projection", fontsize=18)
plt.show()

fig = plt.figure(figsize = (10,8))


m = Basemap(projection='robin',llcrnrlat=-80,urcrnrlat=80,llcrnrlon=-180,urcrnrlon=180, lon_0 = 0, lat_0 = 0)
m.drawcoastlines()
m.fillcontinents(color='tan',lake_color='lightblue')
m.drawcountries(linewidth=1, linestyle='solid', color='k' )
m.drawmapboundary(fill_color='lightblue')
plt.title(" Robinson Projection", fontsize=20)
plt.show()

fig = plt.figure(figsize = (10,8))


m = Basemap(projection='cyl',llcrnrlon=32.5,llcrnrlat=3,urcrnrlon=49,urcrnrlat=15,
resolution = 'i')
m.drawlsmask(land_color = "#ddaa66", ocean_color="#7777ff", resolution = 'i',
lakes=True, grid=1.25)
m.drawcountries(linewidth=1, linestyle='solid', color='k' )
plt.title("Land-sea mask image", fontsize=20)
plt.show()

fig = plt.figure(figsize = (10,8))


m = Basemap(projection='cyl',llcrnrlon=32.5,llcrnrlat=3,urcrnrlon=49,urcrnrlat=15,
resolution='i')
m.bluemarble(scale=1.0)
m.drawcoastlines()
m.drawcountries(linewidth=1, linestyle='solid', color='k' )
plt.title("NASA Blue Marble image as background map", fontsize=18)
plt.show()

fig = plt.figure(figsize = (10,8))


m = Basemap(projection='cyl',llcrnrlon=32.5,llcrnrlat=3,urcrnrlon=49,urcrnrlat=15,
resolution='i')
m.shadedrelief()
m.drawcoastlines()
m.drawcountries(linewidth=1, linestyle='solid', color='k' )
plt.title("Shaded relief image as background map", fontsize=18)
plt.show()

fig = plt.figure(figsize = (10,8))


m = Basemap(projection='cyl',llcrnrlon=32.5,llcrnrlat=3,urcrnrlon=49,urcrnrlat=15,
resolution='i')
m.etopo(scale=1.2)
m.drawcoastlines()
m.drawcountries(linewidth=1, linestyle='solid', color='k' )
plt.title("Shaded relief image as background map", fontsize=18)
plt.show()

CONTENT BEYOND SYLLABUS

9. Reading data from text files, Excel and the web and exploring various
commands for doing descriptive analytics on the Housing dataset.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
housing = pd.read_csv('D:/edith esther/CS3361 DSC LAB/housing.csv')
print(housing.shape)
print(housing.head())
print(housing.head(10))
print(housing.tail())
print(housing.plot("median_income", "median_house_value"))
plt.show()
print(housing.plot.scatter("median_income", "median_house_value"))
plt.show()
x_train, x_test, y_train, y_test = train_test_split(housing.median_income,
housing.median_house_value,
test_size = 0.2)
regr = LinearRegression()
regr.fit(np.array(x_train).reshape(-1,1), y_train)
preds = regr.predict(np.array(x_test).reshape(-1,1))
print(y_test.head())
print(preds)
residuals = preds - y_test
print(plt.hist(residuals))
plt.show()
print(mean_squared_error(y_test, preds) ** 0.5)
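
A common companion to the RMSE printed above is the coefficient of determination; a one-line addition, reusing the fitted regr and the test split from the listing (LinearRegression.score returns R^2):

print(regr.score(np.array(x_test).reshape(-1,1), y_test))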
