Exp 4 Statistical Data Analysis With Python Sdk Ok
Exp 4 Statistical Data Analysis With Python Sdk Ok
# Outcomes: number of aces, i.e. the cards that count as the event
# (the rest of this example is not part of this snippet)
aces = 4
Output:-
0.08
8.0%
Example no.02
# Create function that returns probability percent rounded to one decimal place
def event_probability(event_outcomes, sample_space):
    """Return the probability of an event as a percentage.

    Args:
        event_outcomes: Number of outcomes that count as the event.
        sample_space: Total number of possible outcomes.

    Returns:
        ``event_outcomes / sample_space`` expressed as a percentage and
        rounded to one decimal place.

    Raises:
        ZeroDivisionError: If ``sample_space`` is 0.
    """
    probability = (event_outcomes / sample_space) * 100
    return round(probability, 1)
# Event outcomes: the number of hearts
hearts = 13
# Sample space: the total number of cards
cards = 52
# Probability (in percent) of drawing a heart
heart_probability = event_probability(event_outcomes=hearts, sample_space=cards)
Output:-
Probability of Heart :- 25.0%
Probability of Face Card :- 23.1%
Probability of Queen of Hearts :- 1.9%
Permutations
Example no.3
# Permutations Code
import math

# Number of items available (n) and number of positions to fill (k)
n = 4
k = 2
# Determine Permutations: n! / (n - k)! ordered arrangements, then print
# the result (the snippet's documented output is 12.0)
Permutations = math.factorial(n) / math.factorial(n - k)
print(Permutations)
Output:-
12.0
Example no.4
# Combinations Code
# Choose k items out of n where the order of selection is irrelevant
n, k = 52, 2

# Ordered selections first: n! / (n - k)!
total_orderings = math.factorial(n)
leftover_orderings = math.factorial(n - k)
Permutations = total_orderings / leftover_orderings

# Each unordered selection was counted k! times above, so divide that out
Combinations = Permutations / math.factorial(k)
print(Combinations)
Output:-
1326.0
# Plot a histogram of random_numbers (defined outside this snippet);
# assigning to `_` discards the value returned by plt.hist
_ = plt.hist(random_numbers)
Output:-
Plotting a Histogram of Iris Data
#Example no.06
# Import plotting modules
import matplotlib.pyplot as plt
import seaborn as sns
# pandas provides read_csv; this snippet used `pd` without importing it
import pandas as pd

# Load the iris measurements and preview the first rows
iris = pd.read_csv('../input/iris.data.csv')
# NOTE(review): the preview printed for this snippet shows a data row
# ("5.1 3.5 1.4 0.2 Iris-setosa") in the header position, so the CSV
# presumably has no header line -- confirm and consider header=None.
print(iris.head())
Output:-
5.1 3.5 1.4 0.2 Iris-setosa
0 4.9 3.0 1.4 0.2 Iris-setosa
1 4.7 3.2 1.3 0.2 Iris-setosa
2 4.6 3.1 1.5 0.2 Iris-setosa
3 5.0 3.6 1.4 0.2 Iris-setosa
4 5.4 3.9 1.7 0.4 Iris-setosa
Computing the ECDF
#Example no.7
# Compute the ECDF coordinates for the versicolor petal lengths
x_vers, y_vers = ecdf(versicolor_petal_length)

# Draw the points only: dot markers with no connecting line
point_style = {'marker': '.', 'linestyle': 'none'}
plt.plot(x_vers, y_vers, **point_style)

# Axis labels
plt.xlabel('Petal Length')
plt.ylabel('ECDF')

# Render the figure
plt.show()
Output:-
#Example no.8
# Compute ECDFs for the setosa, versicolor and virginica petal lengths,
# calling ecdf on each array in that order
(x_set, y_set), (x_vers, y_vers), (x_virg, y_virg) = [
    ecdf(petal_lengths)
    for petal_lengths in (
        setosa_petal_length,
        versicolor_petal_length,
        virginica_petal_length,
    )
]
Output:-
The np.random module and Bernoulli trial
#Example no.9
# Seed random number generator
np.random.seed(42)
# Plot the histogram with default number of bins; label your axes.
# density=True normalises the histogram; it replaces the `normed`
# keyword, which Matplotlib removed in 3.1.
_ = plt.hist(n_defaults, density=True)
_ = plt.xlabel('number of defaults out of 100 loans')
_ = plt.ylabel('probability')
Output:-
#Example no.11
# Draw 100000 samples from Normal distribution with stds of interest:
# samples_std1, samples_std3, samples_std10
samples_std1 = np.random.normal(20, 1, size=100000)
samples_std3 = np.random.normal(20, 3, size=100000)
samples_std10 = np.random.normal(20, 10, size=100000)
# Make histograms (density=True replaces the `normed` keyword, which
# Matplotlib removed in 3.1)
plt.hist(samples_std1, density=True, bins=100, histtype='step')
plt.hist(samples_std3, density=True, bins=100, histtype='step')
plt.hist(samples_std10, density=True, bins=100, histtype='step')
# Make a legend, set limits and show plot
_ = plt.legend(('std = 1', 'std = 3', 'std = 10'))
plt.ylim(-0.01, 0.42)
plt.show()
Output:-
Sampling & Sample Distribution
#Example no.1
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
Output:-
#Example no.2
# Seed NumPy's global RNG before sampling
np.random.seed(11)
# Draw 10 random rows whose quality is 7 or 8, keep only the alcohol
# column and renumber the rows from 0. reset_index(drop=True) discards the
# old index directly instead of adding it as an 'index' column and then
# dropping that column again.
sample_7_and_above = (
    data[data['quality'].isin([7, 8])]
    .sample(10)[['alcohol']]
    .reset_index(drop=True)
)
sample_7_and_above
Output:-
#Example no.3
# Plot the distribution of the sampled alcohol values without histogram bars
sns.distplot(sample_7_and_above["alcohol"], hist=False)
# Caption built from x_bar and s, both defined outside this snippet
title = "".join(["X_bar_1 = ", str(x_bar), ", s1 = ", str(s)])
plt.title(title)
Output:-
Sampling distribution
#Example no.4
# Draw one sample of 10 rows per seed (0..8) and record each sample's
# mean and standard deviation of alcohol
seed = np.arange(0, 9)
x_bar = []
std_dev = []
for s in seed:
    # Re-seed on every iteration so each sample is reproducible
    np.random.seed(s)
    sample_7_and_above = (
        data[data['quality'].isin([7, 8])]
        .sample(10)[['alcohol']]
        .reset_index(drop=True)
    )
    x_bar.append(np.mean(sample_7_and_above["alcohol"]))
    std_dev.append(np.std(sample_7_and_above["alcohol"]))
Output:-
#Example no.5
# Plot the distribution of the "Sample Means (X_bar)" column of `samples`
# (`samples` is built outside this snippet)
sns.distplot(samples["Sample Means (X_bar)"])
plt.title("Distribution of the sample means")
Output:-
#Example no.5
# Compare the distribution of alcohol in the whole dataset with the
# distribution of sample means as the number of samples grows.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
# Draw on the axes plt.subplots already created (flattened row by row,
# the same numbering add_subplot(3, 2, i) uses) instead of calling
# fig.add_subplot again, which on current Matplotlib adds further axes
# on top of the existing grid.
panel_axes = axes.flatten()

# Panel 1: the full dataset
ax = panel_axes[0]
sns.distplot(data["alcohol"], ax=ax)
ax.set_title("Original distribution of Alcohol level in the entire dataset")

# Panels 2-6: (number of samples, panel title). Every panel draws its own
# set of samples so the plotted means match the count named in its title;
# previously the last three panels re-plotted the 7-sample means.
sampling_panels = [
    (2, "Sample means of 2 samples of 10 each"),
    (7, "Sample means of 7 samples of 10 each"),
    (20, "Sample means of 20 samples of 10 each"),
    (100, "100 samples of 10 each"),
    (500, "500 samples of 10 each"),
]
for ax, (n_samples, panel_title) in zip(panel_axes[1:], sampling_panels):
    seed = np.arange(0, n_samples)
    x_bar = []
    for s in seed:
        # One reproducible sample of 10 rows per seed value
        np.random.seed(s)
        sample_7_and_above = data.sample(10)[['alcohol']].reset_index(drop=True)
        x_bar.append(np.mean(sample_7_and_above["alcohol"]))
    sns.distplot(x_bar, ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()
Output:-
#Example no.6
# Seed NumPy's global RNG before sampling
np.random.seed(11)
# Draw 13 random rows whose quality is 7 or 8, keep only the alcohol
# column and renumber the rows from 0. reset_index(drop=True) discards the
# old index directly instead of adding it as an 'index' column and then
# dropping that column again.
sample_7_and_above = (
    data[data['quality'].isin([7, 8])]
    .sample(13)[['alcohol']]
    .reset_index(drop=True)
)
sample_7_and_above
Output:-
Hypothesis testing in Machine learning using Python
#Example no.1
from scipy.stats import ttest_1samp
import numpy as np

# Load the ages and report them together with their mean
ages = np.genfromtxt("ages.csv")
print(ages)
ages_mean = np.mean(ages)
print(ages_mean)

# One-sample t-test of the ages against a hypothesised mean of 30
tset, pval = ttest_1samp(ages, 30)
print("p-values", pval)
if pval < 0.05:  # alpha value is 0.05 or 5%
    print(" we are rejecting null hypothesis")
else:
    print("we are accepting null hypothesis")
Output:-
import pandas as pd
from scipy import stats

# Load the blood-pressure readings
df = pd.read_csv("blood_pressure.csv")
# Summary statistics of both columns; the result is not assigned or
# printed here
df[['bp_before', 'bp_after']].describe()

# t-test on the two related samples bp_before and bp_after
ttest, pval = stats.ttest_rel(df['bp_before'], df['bp_after'])
print(pval)
if pval < 0.05:
    print("reject null hypothesis")
else:
    print("accept null hypothesis")
import pandas as pd
from scipy import stats
from statsmodels.stats import weightstats as stests

# One-sample z-test of bp_before against a hypothesised value of 156
# (`df` is the blood-pressure DataFrame loaded outside this snippet)
ztest, pval = stests.ztest(df['bp_before'], x2=None, value=156)
print(float(pval))
if pval < 0.05:
    print("reject null hypothesis")
else:
    print("accept null hypothesis")