Data Science Manual

Data Science Lab Experiment

Exp:01 Working With Data Frame

Program:
import pandas as pd

# load data into a DataFrame
data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}
df = pd.DataFrame(data)
# select the first row by label
print(df.loc[0])

Output:
calories    420
duration     50
Name: 0, dtype: int64
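As a small extension beyond the original exercise, .loc can also select several rows at once, and a custom index can be supplied when the frame is built; a minimal sketch, assuming the same dictionary of data:

import pandas as pd

data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}
df = pd.DataFrame(data)
# passing a list of labels to .loc returns a DataFrame instead of a Series
print(df.loc[[0, 1]])
# a custom index can be given when the frame is created
df2 = pd.DataFrame(data, index=["day1", "day2", "day3"])
print(df2.loc["day2"])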
Exp:02 Basic Plot Using Matplotlib

Program:
import matplotlib.pyplot as plt

a = [1, 2, 3, 4, 5]
b = [0, 0.6, 0.2, 15, 10, 8, 16, 21]
plt.plot(a)
# "o" is for circle markers and "r" is for red
plt.plot(b, "or")
plt.plot(list(range(0, 22, 3)))
plt.xlabel("day")
plt.ylabel("temp")
c = [4, 2, 6, 8, 3, 20, 13, 15]
plt.plot(c, label="4th rep")
# get the current axes
ax = plt.gca()
# get control over the individual boundary lines of the graph body
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
# set the bounds of the left boundary line to a fixed range
ax.spines["left"].set_bounds(-3, 40)
# set the interval at which the x-axis places its marks
plt.xticks(list(range(-3, 10)))
# set the interval at which the y-axis places its marks
plt.yticks(list(range(-3, 20, 3)))
# the legend denotes what colour signifies what
ax.legend(["1st rep", "2nd rep", "3rd rep", "4th rep"])
# annotate writes text on the graph; xy gives the position on the graph
plt.annotate("Temperature v/s days", xy=(1.01, -2.15))
plt.title("all features discussed")
plt.show()
Output:
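As an optional variant not in the original manual, the same kind of figure can be built with Matplotlib's object-oriented interface, which also makes it easy to save the result to a file; a minimal sketch with two of the series:

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
ax.plot([1, 2, 3, 4, 5], label="1st rep")
ax.plot([0, 0.6, 0.2, 15, 10, 8, 16, 21], "or", label="2nd rep")
ax.set_xlabel("day")
ax.set_ylabel("temp")
ax.legend()
# savefig stores the figure to disk instead of (or as well as) showing it
fig.savefig("basic_plot.png", dpi=150)
plt.show()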
Exp: 03 Frequency Distributions, Averages, Variability

Program:
# program to get the average of a list
import numpy as np

# taking a list of elements
list1 = [2, 40, 2, 502, 177, 7, 9]
# calculate the average using average()
print(np.average(list1))

Output:
105.57142857142857

# program to get the variance of a list
import numpy as np

list2 = [2, 4, 4, 4, 5, 5, 7, 9]
# calculate the variance using var()
print(np.var(list2))

Output:
4.0

# program to get the standard deviation of a list
import numpy as np

list3 = [290, 124, 127, 899]
print(np.std(list3))
Output:
318.35750344541907
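For reference, the same statistics can be computed directly from their definitions in plain Python; this is an added illustration, not part of the original programs:

# mean, population variance and standard deviation from first principles
data = [2, 4, 4, 4, 5, 5, 7, 9]
n = len(data)
mean = sum(data) / n
# np.var and np.std default to the population versions (divide by n, not n - 1)
variance = sum((x - mean) ** 2 for x in data) / n
std_dev = variance ** 0.5
print(mean, variance, std_dev)   # 5.0 4.0 2.0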

Exp: 04 Normal Curves, Correlation and Scatter Plots, Correlation Coefficient

Program:
# program for normal curve
import matplotlib.pyplot as plt
import numpy as np

mu, sigma = 0.5, 0.1
s = np.random.normal(mu, sigma, 1000)
# create the bins and histogram
count, bins, ignored = plt.hist(s, 20, density=True)
plt.show()
Output:
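To confirm that the histogram really follows a normal curve, the theoretical density can be drawn over it; this sketch is an optional extension of the exercise using the same mu and sigma:

import matplotlib.pyplot as plt
import numpy as np

mu, sigma = 0.5, 0.1
s = np.random.normal(mu, sigma, 1000)
count, bins, ignored = plt.hist(s, 20, density=True)
# overlay the normal probability density function for the same mu and sigma
pdf = (1 / (sigma * np.sqrt(2 * np.pi))) * np.exp(-((bins - mu) ** 2) / (2 * sigma ** 2))
plt.plot(bins, pdf, linewidth=2, color="r")
plt.show()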
# program for correlation and scatter plots
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

y = pd.Series([1, 2, 3, 4, 3, 5, 4])
x = pd.Series([1, 2, 3, 4, 5, 6, 7])
# scatter plot of the two series
plt.scatter(x, y)
plt.show()
correlation = y.corr(x)
print(correlation)
Output:
0.8603090020146067
# correlation coefficient
import math

def correlationCoefficient(x, y, n):
    sum_x = 0
    sum_y = 0
    sum_xy = 0
    squareSum_x = 0
    squareSum_y = 0
    i = 0
    while i < n:
        sum_x = sum_x + x[i]
        sum_y = sum_y + y[i]
        sum_xy = sum_xy + x[i] * y[i]
        squareSum_x = squareSum_x + x[i] * x[i]
        squareSum_y = squareSum_y + y[i] * y[i]
        i += 1
    # formula for calculating the correlation coefficient
    corr = float(n * sum_xy - sum_x * sum_y) / float(math.sqrt(
        (n * squareSum_x - sum_x * sum_x) * (n * squareSum_y - sum_y * sum_y)))
    return corr

x = [15, 18, 21, 24, 27]
y = [25, 25, 27, 31, 32]
n = len(x)
print("{0: .6f}".format(correlationCoefficient(x, y, n)))
Output:
0.953463
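The hand-written function can be cross-checked against NumPy's built-in routine; this comparison is an addition to the manual's program:

import numpy as np

x = [15, 18, 21, 24, 27]
y = [25, 25, 27, 31, 32]
# np.corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is r
print(np.corrcoef(x, y)[0, 1])   # ~0.953463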
Exp:05 Regression

Program:
import numpy as np
import matplotlib.pyplot as plt

def estimate_coef(x, y):
    # number of observations/points
    n = np.size(x)
    m_x = np.mean(x)
    m_y = np.mean(y)
    ss_xy = np.sum(y * x) - n * m_y * m_x
    ss_xx = np.sum(x * x) - n * m_x * m_x
    # calculate regression coefficients
    b_1 = ss_xy / ss_xx
    b_0 = m_y - b_1 * m_x
    return (b_0, b_1)

def plot_regression_line(x, y, b):
    plt.scatter(x, y, color="m", marker="o", s=30)
    # predicted response vector
    y_pred = b[0] + b[1] * x
    plt.plot(x, y_pred, color="g")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.show()

def main():
    # data/observations
    x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
    b = estimate_coef(x, y)
    print("estimated coefficients:\nb_0 = {}\nb_1 = {}".format(b[0], b[1]))
    plot_regression_line(x, y, b)

if __name__ == "__main__":
    main()
Output:
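The same coefficients can be verified with scikit-learn; the sketch below is an optional cross-check on the least-squares formulas used above:

import numpy as np
from sklearn.linear_model import LinearRegression

x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]).reshape(-1, 1)
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
model = LinearRegression().fit(x, y)
# intercept_ corresponds to b_0 and coef_[0] to b_1 above
print(model.intercept_, model.coef_[0])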
Exp:06 Z-Test

Program:
import math
import numpy as np
from numpy.random import randn
from statsmodels.stats.weightstats import ztest

# generate a random array of 50 numbers with mean 110 and sd 15/sqrt(50),
# similar to the IQ-score data we assume above
mean_iq = 110
sd_iq = 15 / math.sqrt(50)
alpha = 0.05
null_mean = 100
data = sd_iq * randn(50) + mean_iq
print("mean = %.2f stdv = %.2f" % (np.mean(data), np.std(data)))
ztest_score, p_value = ztest(data, value=null_mean, alternative="larger")
if p_value < alpha:
    print("Reject Null Hypothesis")
else:
    print("Fail To Reject Null Hypothesis")

Output:
Reject Null Hypothesis
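For reference, the z statistic that ztest computes can also be written out by hand; this short sketch is an illustrative addition, not part of the original program:

import numpy as np

# z = (sample mean - hypothesised mean) / (sample std / sqrt(n))
def z_statistic(data, null_mean):
    n = len(data)
    return (np.mean(data) - null_mean) / (np.std(data, ddof=1) / np.sqrt(n))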
Exp:07 T-Test

Program:
import numpy as np
from scipy import stats

N = 10
x = np.random.randn(N) + 2
y = np.random.randn(N)
var_x = x.var(ddof=1)
var_y = y.var(ddof=1)
# pooled standard deviation
SD = np.sqrt((var_x + var_y) / 2)
print("Standard deviation =", SD)
# calculate the t-statistic
tval = (x.mean() - y.mean()) / (SD * np.sqrt(2 / N))
# degrees of freedom
dof = 2 * N - 2
# two-sided p-value from comparison with the t-distribution
pval = 1 - stats.t.cdf(tval, df=dof)
print("t = " + str(tval))
print("p= " + str(2 * pval))
# cross-check with SciPy's built-in independent two-sample t-test
tval2, pval2 = stats.ttest_ind(x, y)
print("t = " + str(tval2))
print("p= " + str(pval2))

Output:
Standard deviation = 0.7642398582227466
t= 4.87688162540348
p= 0.0001212767169695983
t= 4.876881625403479
p= 0.00012127671696957205
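When the two samples cannot be assumed to have equal variances, SciPy's Welch variant can be used instead of the pooled test; this is an optional note beyond the original program:

import numpy as np
from scipy import stats

x = np.random.randn(10) + 2
y = np.random.randn(10)
# equal_var=False selects Welch's t-test, which does not pool the variances
tval_w, pval_w = stats.ttest_ind(x, y, equal_var=False)
print("Welch t =", tval_w, "p =", pval_w)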

Exp: 08 ANOVA

Program:
# installing the package
install.packages("dplyr")
# loading the package
library(dplyr)
# variance in mean within group and between groups
boxplot(mtcars$disp ~ factor(mtcars$gear), xlab = "gear", ylab = "disp")
# step 1: set up the null hypothesis and alternate hypothesis
# H0: mu1 = mu2 = mu3 (there is no difference between average displacement
# for different gears)
# H1: not all means are equal
# step 2: calculate the test statistic using the aov function
mtcars_aov <- aov(mtcars$disp ~ factor(mtcars$gear))
summary(mtcars_aov)
# step 3: calculate the F-critical value
# for a 0.05 significance level, alpha = 0.05
# step 4: compare the test statistic with the F-critical value and conclude the test;
# if p < alpha, reject the null hypothesis
Output:
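The same one-way ANOVA idea can be reproduced in Python with SciPy; the sketch below uses three small illustrative groups (standing in for displacement values at three gear levels), not the actual mtcars data:

from scipy import stats

# three hypothetical groups of displacement values
group1 = [160.0, 108.0, 258.0, 225.0]
group2 = [140.8, 167.6, 146.7, 121.0]
group3 = [95.1, 120.3, 351.0, 145.0]
f_stat, p_value = stats.f_oneway(group1, group2, group3)
print("F =", f_stat, "p =", p_value)
# if p < 0.05, reject the null hypothesis that all group means are equal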

Exp:09 Building And Validating Linear Models

Program:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston

sns.set(style="ticks", color_codes=True)
plt.rcParams["figure.figsize"] = (8, 5)
plt.rcParams["figure.dpi"] = 150
# load the Boston housing dataset
boston = load_boston()
print(boston.keys())
print(boston.DESCR)
df = pd.DataFrame(boston.data, columns=boston.feature_names)
# add the target (median house value) as the MEDV column
df["MEDV"] = boston.target
print(df.columns)
print(df.head())
# plotting a heatmap of correlations for the overall data set
sns.heatmap(df.corr(), square=True, cmap="RdYlGn")
plt.show()
# now let's plot a regression plot to see the correlation between RM and MEDV
sns.regplot(x="RM", y="MEDV", data=df)
plt.show()
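Since this experiment is about building and validating a linear model, a minimal fit-and-validate step can follow the plots; the sketch below is an added illustration and assumes the same df with the MEDV column from above:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

X = df[["RM"]]   # single predictor: average number of rooms
y = df["MEDV"]   # target: median house value
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
model = LinearRegression().fit(X_train, y_train)
# validate on the held-out data with the R^2 score
print("R^2 on test data:", r2_score(y_test, model.predict(X_test)))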
Exp: 10 Building And Validating Logistic Models
Program:
import statsmodels.api as sm
import pandas as pd

# loading the training dataset
df = pd.read_csv("logit_train1.csv", index_col=0)
xtrain = df[["gmat", "gpa", "work_experience"]]
ytrain = df[["admitted"]]
# build the logistic regression model
log_reg = sm.Logit(ytrain, xtrain).fit()
Output:
Optimization terminated successfully.
Current function value: 0.352707
Iterations 8
print(log_reg.summary())
Output:
# predicting on new data
df = pd.read_csv("logit_test1.csv", index_col=0)
xtest = df[["gmat", "gpa", "work_experience"]]
ytest = df["admitted"]
yhat = log_reg.predict(xtest)
prediction = list(map(round, yhat))
print("Actual values :", list(ytest.values))
print("Predictions :", prediction)
Output:
# testing the accuracy of the model
from sklearn.metrics import confusion_matrix, accuracy_score

cm = confusion_matrix(ytest, prediction)
print("Confusion matrix :\n", cm)
print("Test accuracy =", accuracy_score(ytest, prediction))
Output:
Confusion matrix :
[[6 0]
[2 2]]
Test accuracy = 0.8
Exp: 11 Time Series Analysis
Program:
import warnings
import itertools
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm

warnings.filterwarnings("ignore")
plt.style.use("fivethirtyeight")
matplotlib.rcParams["axes.labelsize"] = 14
matplotlib.rcParams["xtick.labelsize"] = 12
matplotlib.rcParams["ytick.labelsize"] = 12
matplotlib.rcParams["text.color"] = "k"

df = pd.read_excel("superstore.xls")
furniture = df.loc[df["Category"] == "Furniture"]
furniture["Order Date"].min(), furniture["Order Date"].max()
# (Timestamp("2014-01-06 00:00:00"), Timestamp("2017-12-30 00:00:00"))
cols = ["Row ID", "Order ID", "Ship Date", "Ship Mode", "Customer ID",
        "Customer Name", "Segment", "Country", "City", "State", "Postal Code",
        "Region", "Product ID", "Category", "Sub-Category", "Product Name",
        "Quantity", "Discount", "Profit"]
furniture.drop(cols, axis=1, inplace=True)
furniture = furniture.sort_values("Order Date")
furniture.isnull().sum()
furniture = furniture.groupby("Order Date")["Sales"].sum().reset_index()
furniture = furniture.set_index("Order Date")
furniture.index
# resample daily sales to monthly averages
y = furniture["Sales"].resample("MS").mean()
y["2017":]
y.plot(figsize=(15, 6))
plt.show()
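A natural next step in the analysis is to decompose the monthly series into trend, seasonal and residual components; the sketch below is an added illustration that assumes the resampled series y from above:

import statsmodels.api as sm
import matplotlib.pyplot as plt

# additive decomposition of the monthly sales series into trend,
# seasonality and residuals
decomposition = sm.tsa.seasonal_decompose(y, model="additive")
decomposition.plot()
plt.show()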
