Data Science Manual
Exp: 01 Working with Pandas DataFrames
Program:
import pandas as pd

# load data into a DataFrame
data = {"calories": [420, 380, 390], "duration": [50, 40, 45]}
df = pd.DataFrame(data)
print(df.loc[0])
Output:
calories 420
duration 50
Name: 0, dtype: int64
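df.loc also accepts a list of labels or a slice, returning a DataFrame instead of a Series. A minimal sketch, reusing the df built above:

# select rows 0 and 1 by label; the result is a DataFrame
print(df.loc[[0, 1]])
# positional indexing with iloc behaves the same way on the default integer index
print(df.iloc[0])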
Exp: 02 Basic Plot Using Matplotlib
Program:
import matplotlib.pyplot as plt

a = [1, 2, 3, 4, 5]
b = [0, 0.6, 0.2, 15, 10, 8, 16, 21]
plt.plot(a)
# "o" is for circle markers and "r" is for red
plt.plot(b, "or")
plt.plot(list(range(0, 22, 3)))
plt.xlabel("day")
plt.ylabel("temp")
c = [4, 2, 6, 8, 3, 20, 13, 15]
plt.plot(c, label="4th rep")
# get the current axes
ax = plt.gca()
# take control of the individual boundary lines of the graph body
ax.spines["right"].set_visible(False)
ax.spines["top"].set_visible(False)
# fix the range (bounds) of the left boundary line
ax.spines["left"].set_bounds(-3, 40)
# set the interval at which the x-axis places its tick marks
plt.xticks(list(range(-3, 10)))
# set the interval at which the y-axis places its tick marks
plt.yticks(list(range(-3, 20, 3)))
# the legend states what each color signifies
ax.legend(["1st rep", "2nd rep", "3rd rep", "4th rep"])
# annotate writes text on the graph; xy gives the position
plt.annotate("Temperature v/s days", xy=(1.01, -2.15))
plt.title("all features discussed")
plt.show()
Output:
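The same plot can also be built through Matplotlib's object-oriented interface, which the plt.gca() call above already touches on. A minimal sketch, reusing the a and c series:

import matplotlib.pyplot as plt

a = [1, 2, 3, 4, 5]
c = [4, 2, 6, 8, 3, 20, 13, 15]
# create the figure and axes explicitly instead of relying on implicit state
fig, ax = plt.subplots()
ax.plot(a, label="1st rep")
ax.plot(c, label="4th rep")
ax.set_xlabel("day")
ax.set_ylabel("temp")
ax.legend()
plt.show()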
Exp: 03 Frequency Distributions, Averages, Variability
Program:
# program to get the average of a list
import numpy as np

# taking a list of elements
List1 = [2, 40, 2, 502, 177, 7, 9]
# calculate the average using average()
print(np.average(List1))
Output:
105.57142857142857
Output:
4.0
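Unlike np.mean(), np.average() also accepts per-element weights. A short sketch reusing List1, with an illustrative weight vector (the weights here are arbitrary):

import numpy as np

List1 = [2, 40, 2, 502, 177, 7, 9]
# arbitrary weights for illustration; np.average normalizes them internally
weights = [1, 1, 1, 2, 2, 1, 1]
print(np.average(List1, weights=weights))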
Program:
# program for a normal curve
import matplotlib.pyplot as plt
import numpy as np

mu, sigma = 0.5, 0.1
s = np.random.normal(mu, sigma, 1000)
# create the bins and histogram (the density keyword normalizes the counts)
count, bins, ignored = plt.hist(s, 20, density=True)
plt.show()
Output:
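To check the histogram against the theoretical distribution, the normal probability density can be drawn over the same bins; this is the standard NumPy documentation recipe, reusing mu, sigma and bins from the program above:

import numpy as np
import matplotlib.pyplot as plt

# theoretical normal density evaluated at the bin edges
plt.plot(bins,
         1 / (sigma * np.sqrt(2 * np.pi)) *
         np.exp(-((bins - mu) ** 2) / (2 * sigma ** 2)),
         linewidth=2, color="r")
plt.show()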
Exp: 04 Correlation and Scatter Plots
Program:
# program for correlation and scatter plots
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

y = pd.Series([1, 2, 3, 4, 3, 5, 4])
x = pd.Series([1, 2, 3, 4, 5, 6, 7])
correlation = y.corr(x)
print(correlation)
Output:
0.8603090020146067
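The experiment title also calls for a scatter plot; a minimal sketch visualizing the same x and y before reading the coefficient:

import matplotlib.pyplot as plt

# a strong positive correlation appears as an upward-sloping point cloud
plt.scatter(x, y)
plt.xlabel("x")
plt.ylabel("y")
plt.show()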
# correlation coefficient
import math

def correlationCoefficient(x, y, n):
    sum_x = 0
    sum_y = 0
    sum_xy = 0
    squareSum_x = 0
    squareSum_y = 0
    i = 0
    while i < n:
        sum_x = sum_x + x[i]
        sum_y = sum_y + y[i]
        sum_xy = sum_xy + x[i] * y[i]
        squareSum_x = squareSum_x + x[i] * x[i]
        squareSum_y = squareSum_y + y[i] * y[i]
        i += 1
    # formula for the Pearson correlation coefficient
    corr = (n * sum_xy - sum_x * sum_y) / math.sqrt(
        (n * squareSum_x - sum_x * sum_x) * (n * squareSum_y - sum_y * sum_y))
    return corr

x = [15, 18, 21, 24, 27]
y = [25, 25, 27, 31, 32]
n = len(x)
print("{0: .6f}".format(correlationCoefficient(x, y, n)))
Output:
0.953463
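The hand-rolled loop can be cross-checked with NumPy's vectorized np.corrcoef, which returns the full 2x2 correlation matrix:

import numpy as np

# the off-diagonal entry [0, 1] is the Pearson correlation between x and y
print(np.corrcoef(x, y)[0, 1])   # approximately 0.953463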
Exp: 05 Regression
Program:
import numpy as np
import matplotlib.pyplot as plt

def estimate_coef(x, y):
    # number of observations/points
    n = np.size(x)
    m_x = np.mean(x)
    m_y = np.mean(y)
    ss_xy = np.sum(y * x) - n * m_y * m_x
    ss_xx = np.sum(x * x) - n * m_x * m_x
    # calculate regression coefficients
    b_1 = ss_xy / ss_xx
    b_0 = m_y - b_1 * m_x
    return (b_0, b_1)

def plot_regression_line(x, y, b):
    plt.scatter(x, y, color="m", marker="o", s=30)
    # predicted response vector
    y_pred = b[0] + b[1] * x
    plt.plot(x, y_pred, color="g")
    plt.xlabel("x")
    plt.ylabel("y")
    plt.show()

def main():
    # data/observations
    x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
    b = estimate_coef(x, y)
    print("Estimated coefficients:\nb_0 = {}\nb_1 = {}".format(b[0], b[1]))
    plot_regression_line(x, y, b)

if __name__ == "__main__":
    main()
Output:
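As a sanity check, np.polyfit fits the same least-squares line in one call; a short sketch with the arrays from main():

import numpy as np

x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])
# polyfit returns coefficients highest degree first: slope b_1, then intercept b_0
b_1, b_0 = np.polyfit(x, y, 1)
print(b_0, b_1)   # should match estimate_coef(x, y)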
Exp: 06 Z-Test
Program:
import math
import numpy as np
from numpy.random import randn
from statsmodels.stats.weightstats import ztest

# generate a random array of 50 numbers with mean 110 and
# standard deviation 15/sqrt(50), similar to the IQ-score data assumed above
mean_iq = 110
sd_iq = 15 / math.sqrt(50)
alpha = 0.05
null_mean = 100
data = sd_iq * randn(50) + mean_iq
print("mean = %.2f stdv = %.2f" % (np.mean(data), np.std(data)))
ztest_Score, p_value = ztest(data, value=null_mean, alternative="larger")
if p_value < alpha:
    print("Reject Null Hypothesis")
else:
    print("Fail To Reject Null Hypothesis")
Output:
Reject Null Hypothesis
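The statistic that ztest computes can be reproduced by hand from the definition z = (sample mean - null mean) / (sample std / sqrt(n)); a sketch reusing data and null_mean, which should agree with ztest_Score up to floating-point rounding:

import numpy as np

n = len(data)
# sample standard deviation (ddof=1), as used by the z-test
z = (np.mean(data) - null_mean) / (np.std(data, ddof=1) / np.sqrt(n))
print(z)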
Exp: 07 T-Test
Program:
import numpy as np
from scipy import stats

N = 10
x = np.random.randn(N) + 2
y = np.random.randn(N)
var_x = x.var(ddof=1)
var_y = y.var(ddof=1)
# pooled standard deviation
SD = np.sqrt((var_x + var_y) / 2)
print("Standard deviation =", SD)
# calculate the t-statistic
tval = (x.mean() - y.mean()) / (SD * np.sqrt(2 / N))
# compare with the critical t-value
# degrees of freedom
dof = 2 * N - 2
# p-value after comparison with the t-statistic
pval = 1 - stats.t.cdf(tval, df=dof)
print("t = " + str(tval))
print("p = " + str(2 * pval))
# cross-check with scipy's built-in two-sample t-test
tval2, pval2 = stats.ttest_ind(x, y)
print("t = " + str(tval2))
print("p = " + str(pval2))
Output:
Standard deviation = 0.7642398582227466
t= 4.87688162540348
p= 0.0001212767169695983
t= 4.876881625403479
p= 0.00012127671696957205
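Note that stats.ttest_ind pools the two sample variances by default, matching the manual formula above; Welch's variant drops the equal-variance assumption:

from scipy import stats

# Welch's t-test: does not assume var_x == var_y
tval3, pval3 = stats.ttest_ind(x, y, equal_var=False)
print("t = " + str(tval3))
print("p = " + str(pval3))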
Exp: 08 ANOVA
Program:
# installing the package
install.packages("dplyr")
# loading the package
library(dplyr)
# variance in mean within group and between groups
boxplot(mtcars$disp ~ factor(mtcars$gear), xlab = "gear", ylab = "disp")
# step 1: set up the null hypothesis and the alternate hypothesis
# H0: mu1 = mu2 = mu3 (there is no difference between average displacement
# for different gears)
# H1: not all means are equal
# step 2: calculate the test statistic using the aov function
mtcars_aov <- aov(mtcars$disp ~ factor(mtcars$gear))
summary(mtcars_aov)
# step 3: calculate the F-critical value
# for a 0.05 significance level, alpha = 0.05
# step 4: compare the test statistic with the F-critical value and conclude;
# if p < alpha, reject the null hypothesis
Output:
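For comparison, the same one-way ANOVA can be run in Python with scipy.stats.f_oneway; a minimal sketch with hypothetical displacement samples standing in for the three gear groups (the values below are illustrative, not the mtcars data):

from scipy import stats

# hypothetical displacement samples for gears 3, 4 and 5 (illustrative only)
gear3 = [275.8, 360.0, 318.0, 304.0]
gear4 = [160.0, 146.7, 140.8, 121.0]
gear5 = [120.3, 95.1, 351.0, 145.0]
f_stat, p_value = stats.f_oneway(gear3, gear4, gear5)
print("F =", f_stat, "p =", p_value)   # reject H0 when p < alpha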
Exp: 09 Building and Validating Linear Models
Program:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_boston

sns.set(style="ticks", color_codes=True)
plt.rcParams["figure.figsize"] = (8, 5)
plt.rcParams["figure.dpi"] = 150
# load the Boston housing dataset
boston = load_boston()
print(boston.keys())
print(boston.DESCR)
df = pd.DataFrame(boston.data, columns=boston.feature_names)
# add the target (median home value) as the MEDV column
df["MEDV"] = boston.target
print(df.columns)
print(df.head())
# plotting a heatmap for the overall data set
sns.heatmap(df.corr(), square=True, cmap="RdYlGn")
# now let's plot a regression plot to see the correlation between RM and MEDV
sns.regplot(x="RM", y="MEDV", data=df)
plt.show()
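A caveat: load_boston was deprecated and is removed from scikit-learn 1.2 onward. On newer installations the same table can be pulled from OpenML instead; a sketch, assuming network access:

from sklearn.datasets import fetch_openml

# the Boston housing data hosted on OpenML; includes MEDV as a column
boston = fetch_openml(name="boston", version=1, as_frame=True)
df = boston.frame
print(df.columns)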
Exp: 10 Building and Validating Logistic Models
Program:
import statsmodels.api as sm
import pandas as pd

# loading the training dataset
df = pd.read_csv("logit_train1.csv", index_col=0)
xtrain = df[["gmat", "gpa", "work_experience"]]
ytrain = df[["admitted"]]
log_reg = sm.Logit(ytrain, xtrain).fit()
Output:
Optimization terminated successfully.
Current function value: 0.352707
Iterations 8
print(log_reg.summary())
Output:
# predicting on new data
df = pd.read_csv("logit_test1.csv", index_col=0)
xtest = df[["gmat", "gpa", "work_experience"]]
ytest = df["admitted"]
yhat = log_reg.predict(xtest)
prediction = list(map(round, yhat))
print("Actual values :", list(ytest.values))
print("Predictions :", prediction)
Output:
#testing the accuracy of the model
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(ytest, prediction)
print("Confusion matrix : \n", cm)
print("Test accuracy = ", accuracy_score(ytest, prediction))
Output:
Confusion matrix :
[[6 0]
 [2 2]]
Test accuracy = 0.8
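One caveat: sm.Logit as called above fits a model with no intercept, because statsmodels does not add a constant column automatically. A sketch of including one with sm.add_constant, assuming the same hypothetical training frames:

import statsmodels.api as sm

# prepend a constant column so the model also estimates an intercept
xtrain_const = sm.add_constant(xtrain)
log_reg_const = sm.Logit(ytrain, xtrain_const).fit()
print(log_reg_const.summary())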
Exp: 11 Time Series Analysis
Program:
import warnings
import itertools
import numpy as np
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")
plt.style.use("fivethirtyeight")
import pandas as pd
import statsmodels.api as sm
import matplotlib

matplotlib.rcParams["axes.labelsize"] = 14
matplotlib.rcParams["xtick.labelsize"] = 12
matplotlib.rcParams["ytick.labelsize"] = 12
matplotlib.rcParams["text.color"] = "k"

df = pd.read_excel("superstore.xls")
furniture = df.loc[df["Category"] == "Furniture"]
furniture["Order Date"].min(), furniture["Order Date"].max()
# (Timestamp('2014-01-06 00:00:00'), Timestamp('2017-12-30 00:00:00'))
cols = ["Row ID", "Order ID", "Ship Date", "Ship Mode", "Customer ID",
        "Customer Name", "Segment", "Country", "City", "State", "Postal Code",
        "Region", "Product ID", "Category", "Sub-Category", "Product Name",
        "Quantity", "Discount", "Profit"]
furniture.drop(cols, axis=1, inplace=True)
furniture = furniture.sort_values("Order Date")
furniture.isnull().sum()
# aggregate daily sales, then index the frame by order date
furniture = furniture.groupby("Order Date")["Sales"].sum().reset_index()
furniture = furniture.set_index("Order Date")
furniture.index
# resample to month-start frequency, taking the mean of daily sales
y = furniture["Sales"].resample("MS").mean()
y["2017":]
y.plot(figsize=(15, 6))
plt.show()
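With the series resampled to a regular monthly frequency, it can be split into trend, seasonal and residual components; a minimal sketch applying statsmodels' seasonal decomposition to the y computed above:

import statsmodels.api as sm
import matplotlib.pyplot as plt

# additive decomposition: y = trend + seasonality + residual
decomposition = sm.tsa.seasonal_decompose(y, model="additive")
decomposition.plot()
plt.show()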