0% found this document useful (0 votes)
2 views

Exp 4 Statistical Data Analysis With Python Sdk Ok

The document contains a series of Python code examples demonstrating concepts in probability, statistics, and data visualization using libraries like NumPy, Pandas, Matplotlib, and Seaborn. It covers topics such as calculating probabilities, permutations, combinations, generating random numbers, and performing hypothesis testing. Additionally, it includes examples of visualizing data distributions and empirical cumulative distribution functions (ECDF).

Uploaded by

gmranuj
Copyright
© All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
2 views

Exp 4 Statistical Data Analysis With Python Sdk Ok

The document contains a series of Python code examples demonstrating concepts in probability, statistics, and data visualization using libraries like NumPy, Pandas, Matplotlib, and Seaborn. It covers topics such as calculating probabilities, permutations, combinations, generating random numbers, and performing hypothesis testing. Additionally, it includes examples of visualizing data distributions and empirical cumulative distribution functions (ECDF).

Uploaded by

gmranuj
Copyright
© All Rights Reserved
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 18

%Name : _______________

%Class :SE Branch :E&TC


%Roll no : ___________ Subject: DAL
Experiment No:04
1.Probability and Statistics with Python
Example no :-1
# Sample space: number of cards in the deck
cards = 52

# Favourable outcomes: number of aces in the deck
aces = 4

# Probability of drawing an ace = favourable outcomes / sample space
ace_probability = aces / cards

# Show the probability as a fraction, rounded to two decimal places
print(round(ace_probability, 2))

# Same probability expressed as a percentage
ace_probability_percent = ace_probability * 100

# Show the percentage rounded to the nearest whole percent
# (round(..., 0) still yields a float, hence the trailing ".0")
print(f"{round(ace_probability_percent, 0)}%")

Output:-
0.08
8.0%

Example no.02
def event_probability(event_outcomes, sample_space):
    """Return the probability of an event as a percentage.

    Args:
        event_outcomes: Number of outcomes that count as the event.
        sample_space: Total number of possible outcomes.

    Returns:
        ``event_outcomes / sample_space`` expressed as a percentage,
        rounded to one decimal place.

    Raises:
        ZeroDivisionError: If ``sample_space`` is 0.
    """
    probability = (event_outcomes / sample_space) * 100
    return round(probability, 1)

# Sample Space
cards = 52

# Determine the probability of drawing a heart
hearts = 13
heart_probability = event_probability(hearts, cards)

# Determine the probability of drawing a face card
face_cards = 12
face_card_probability = event_probability(face_cards, cards)

# Determine the probability of drawing the queen of hearts
queen_of_hearts = 1
queen_of_hearts_probability = event_probability(queen_of_hearts, cards)

# Print each probability (event_probability already returns a percentage)
print("Probability of Heart :- ",str(heart_probability) + '%')
print("Probability of Face Card :- ",str(face_card_probability) + '%')
print("Probability of Queen of Hearts :- ",str(queen_of_hearts_probability) + '%')

Output:-
Probability of Heart :- 25.0%
Probability of Face Card :- 23.1%
Probability of Queen of Hearts :- 1.9%

Permutations
Example no.3
# Permutations Code
import math

n = 4
k = 2

# Number of ordered selections of k items out of n: P(n, k) = n! / (n - k)!
# The divisor must be (n - k)!, not k!; the two only agree when n == 2 * k,
# which is why n=4, k=2 happened to print the right answer before.
# True division is kept so the result prints as a float (12.0).
Permutations = math.factorial(n) / math.factorial(n - k)
print(Permutations)

Output:-
12.0

Example no.4
# Combinations Code
n, k = 52, 2

# Ordered selections first: P(n, k) = n! / (n - k)!
remaining = n - k
Permutations = math.factorial(n) / math.factorial(remaining)

# Dividing by k! discards the ordering, leaving C(n, k)
Combinations = Permutations / math.factorial(k)
print(Combinations)

Output:-
1326.0

Generating random numbers using the np.random module


#Example no.5
# np and plt are used here before any import appears in the document
import numpy as np
import matplotlib.pyplot as plt

# Seed the random number generator so the histogram is reproducible
np.random.seed(42)

# Initialize random numbers: random_numbers
random_numbers = np.empty(100000)

# Generate random numbers by looping over range(100000);
# np.random.random() draws one float uniformly from [0, 1)
for i in range(100000):
    random_numbers[i] = np.random.random()

# Plot a histogram (the return value is discarded into `_`)
_ = plt.hist(random_numbers)

# Show the plot
plt.show()

Output:-
Plotting a Histogram of Iris Data
#Example no.06
# Import plotting modules
import matplotlib.pyplot as plt
import seaborn as sns
# pandas is needed for read_csv and is not imported earlier in the document
import pandas as pd

# NOTE(review): the file appears to have no header row, so read_csv uses the
# first observation (5.1, 3.5, 1.4, 0.2, Iris-setosa) as column names and
# drops it from the data. Passing header=None would keep it -- confirm
# before changing, since it alters the printed head().
iris = pd.read_csv('../input/iris.data.csv')

print(iris.head())

# Create one DataFrame per species; column 4 holds the species label
setosa = iris[iris.iloc[:, 4] == 'Iris-setosa']
versicolor = iris[iris.iloc[:, 4] == 'Iris-versicolor']
virginica = iris[iris.iloc[:, 4] == 'Iris-virginica']

# Petal length is column 2. The column order of the Iris data file is
# sepal length, sepal width, petal length, petal width, species, so
# column 0 (used previously) is the sepal length.
versicolor_petal_length = versicolor.iloc[:, 2]
setosa_petal_length = setosa.iloc[:, 2]
virginica_petal_length = virginica.iloc[:, 2]

# Set default Seaborn style
sns.set()

# Plot histogram of versicolor petal lengths
plt.hist(versicolor_petal_length)

# Show histogram
plt.show()

Output:-
5.1 3.5 1.4 0.2 Iris-setosa
0 4.9 3.0 1.4 0.2 Iris-setosa
1 4.7 3.2 1.3 0.2 Iris-setosa
2 4.6 3.1 1.5 0.2 Iris-setosa
3 5.0 3.6 1.4 0.2 Iris-setosa
4 5.4 3.9 1.7 0.4 Iris-setosa
Computing the ECDF
#Example no.7
import numpy as np


# NOTE(review): ecdf() is called by this and later examples but is not
# defined anywhere in the visible document, so it is provided here.
def ecdf(data):
    """Compute the empirical cumulative distribution function of a sample.

    Args:
        data: One-dimensional sequence of measurements.

    Returns:
        A tuple ``(x, y)`` where ``x`` is ``data`` sorted in ascending order
        and ``y[i]`` is the fraction of observations less than or equal to
        ``x[i]`` (``1/n, 2/n, ..., 1``).
    """
    n = len(data)
    x = np.sort(data)
    y = np.arange(1, n + 1) / n
    return x, y


# Compute ECDF for versicolor data: x_vers, y_vers
x_vers, y_vers = ecdf(versicolor_petal_length)

# Generate plot: one unconnected point marker per observation
plt.plot(x_vers, y_vers, marker='.', linestyle='none')

# Label the axes
plt.xlabel('Petal Length')
plt.ylabel('ECDF')

# Display the plot
plt.show()

Output:-
#Example no.8
# ECDF of the petal-length series for each of the three species
x_set, y_set = ecdf(setosa_petal_length)
x_vers, y_vers = ecdf(versicolor_petal_length)
x_virg, y_virg = ecdf(virginica_petal_length)

# Overlay the three ECDFs on one plot as unconnected point markers
for _x, _y in ((x_set, y_set), (x_vers, y_vers), (x_virg, y_virg)):
    plt.plot(_x, _y, marker='.', linestyle='none')

# Annotate the plot; legend entries follow the plotting order above
plt.legend(('setosa', 'versicolor', 'virginica'), loc='lower right')
plt.xlabel('petal length (cm)')
plt.ylabel('ECDF')

# Display the plot
plt.show()

Output:-
The np.random module and Bernoulli trial
#Example no.9
import numpy as np


# NOTE(review): perform_bernoulli_trials() is called below but is not defined
# anywhere in the visible document, so it is provided here.
def perform_bernoulli_trials(n, p):
    """Run ``n`` Bernoulli trials and return the number of successes.

    Args:
        n: Number of trials.
        p: Probability of success for each trial.

    Returns:
        Count of trials whose uniform draw from [0, 1) fell below ``p``.
    """
    return int(np.sum(np.random.random(size=n) < p))


# Seed random number generator
np.random.seed(42)

# Initialize the number of defaults: n_defaults
n_defaults = np.empty(1000)

# Compute the number of defaults: 1000 simulations of 100 loans,
# each loan defaulting with probability 0.05
for i in range(1000):
    n_defaults[i] = perform_bernoulli_trials(100, 0.05)

# Plot the histogram with default number of bins; label your axes.
# `density=True` replaces the `normed=True` keyword, which newer Matplotlib
# releases no longer accept.
_ = plt.hist(n_defaults, density=True)
_ = plt.xlabel('number of defaults out of 100 loans')
_ = plt.ylabel('probability')

# Show the plot
plt.show()
Output:-
#Example no.10
# Compute ECDF: x, y
x, y = ecdf(n_defaults)

# Plot the ECDF with labeled axes: x holds the sorted default counts and
# y the cumulative fraction of simulations, so the axes are labelled to match
plt.plot(x, y, marker='.', linestyle='none')
plt.xlabel('number of defaults out of 100 loans')
plt.ylabel('ECDF')

# Show the plot
plt.show()

# Compute the number of 100-loan simulations with 10 or more defaults: n_lose_money
n_lose_money = np.sum(n_defaults >= 10)

# Compute and print probability of losing money
print('Probability of losing money =', n_lose_money / len(n_defaults))

Output:-
#Example no.11
# Draw 100000 samples from Normal distribution with stds of interest:
# samples_std1, samples_std3, samples_std10 (all with mean 20)
samples_std1 = np.random.normal(20, 1, size=100000)
samples_std3 = np.random.normal(20, 3, size=100000)
samples_std10 = np.random.normal(20, 10, size=100000)

# Make histograms; `density=True` replaces the `normed=True` keyword,
# which newer Matplotlib releases no longer accept
plt.hist(samples_std1, density=True, bins=100, histtype='step')
plt.hist(samples_std3, density=True, bins=100, histtype='step')
plt.hist(samples_std10, density=True, bins=100, histtype='step')

# Make a legend, set limits and show plot
_ = plt.legend(('std = 1', 'std = 3', 'std = 10'))
plt.ylim(-0.01, 0.42)
plt.show()

Output:-
Sampling & Sample Distribution

#Example no.1
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns


import matplotlib.pyplot as plt
# Load the red-wine quality data set. The path is a single string literal;
# it had been wrapped across two lines after "winequality-".
data = pd.read_csv("/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")
# Preview the first rows (displayed only when run in a notebook cell)
data.head()

Output:-

#Example no.2
# Seed NumPy's global generator so the sample drawn below is reproducible
np.random.seed(11)

# Draw 10 rows at random from the wines with quality 7 or 8 and keep only the
# alcohol column. reset_index(drop=True) renumbers the rows from 0 and
# discards the old row labels.
sample_7_and_above = (
    data[data['quality'].isin([7, 8])]
    .sample(10)[['alcohol']]
    .reset_index(drop=True)
)
sample_7_and_above
Output:-

#Example no.3
# Sample mean and standard deviation shown in the plot title. They were
# referenced without being defined at this point; they are computed with
# np.mean / np.std, the same calls the sampling-distribution example uses.
# NOTE(review): np.std defaults to ddof=0 -- confirm that is the intended "s".
x_bar = np.mean(sample_7_and_above["alcohol"])
s = np.std(sample_7_and_above["alcohol"])

# Density curve only (hist=False) of the sampled alcohol levels
sns.distplot(sample_7_and_above["alcohol"], hist=False)
title = "X_bar_1 = " + str(x_bar) + ", s1 = "+ str(s)
plt.title(title)

Output:-
Sampling distribution

#Example no.4
# One random seed per sample: seeds 0..8 give nine reproducible samples
seed = np.arange(0, 9)

x_bar = []
std_dev = []

for s in seed:
    np.random.seed(s)
    # 10 random wines with quality 7 or 8, alcohol column only, rows renumbered
    sample_7_and_above = (
        data[data['quality'].isin([7, 8])]
        .sample(10)[['alcohol']]
        .reset_index(drop=True)
    )
    x_bar.append(np.mean(sample_7_and_above["alcohol"]))
    std_dev.append(np.std(sample_7_and_above["alcohol"]))

# One row per sample: its mean and its standard deviation
samples = pd.DataFrame(
    columns=["Sample Means (X_bar)", "Sample Standard Deviation (s)"],
    data=list(zip(x_bar, std_dev)),
)
samples

Output:-

#Example no.5
# Plot the distribution of the per-sample means stored in `samples`
sample_means = samples["Sample Means (X_bar)"]
sns.distplot(sample_means)
plt.title("Distribution of the sample means")

Output:-

#Example no.5
def _sample_means(frame, n_samples, sample_size=10):
    """Return the mean alcohol level of ``n_samples`` random samples.

    Sample ``i`` is drawn right after seeding NumPy's global generator with
    ``i`` (seeds ``0 .. n_samples - 1``), so the result is reproducible.

    Args:
        frame: DataFrame with an ``alcohol`` column.
        n_samples: Number of samples to draw.
        sample_size: Number of rows in each sample.

    Returns:
        List with one mean per sample, in seed order.
    """
    means = []
    for s in range(n_samples):
        np.random.seed(s)
        sample = frame.sample(sample_size)[['alcohol']].reset_index(drop=True)
        means.append(np.mean(sample["alcohol"]))
    return means


# plt.subplots already creates the 3x2 grid of axes; they are drawn on
# directly instead of stacking a second set on top with fig.add_subplot.
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(12, 10))
panels = axes.flatten()

# Panel 1: the population, i.e. alcohol level over the whole data set
sns.distplot(data["alcohol"], ax=panels[0])
panels[0].set_title("Original distribution of Alcohol level in the entire dataset")

# Panels 2-6: distribution of the sample mean for a growing number of samples
panel_specs = [
    (2, "Sample means of 2 samples of 10 each"),
    (7, "Sample means of 7 samples of 10 each"),
    (20, "Sample means of 20 samples of 10 each"),
    (100, "100 samples of 10 each"),
    (500, "500 samples of 10 each"),
]
for ax, (n_samples, panel_title) in zip(panels[1:], panel_specs):
    x_bar = _sample_means(data, n_samples)
    sns.distplot(x_bar, ax=ax)
    ax.set_title(panel_title)

fig.tight_layout()
plt.show()

Output:-

#Example no.6
# Seed NumPy's global generator so the sample drawn below is reproducible
np.random.seed(11)

# Draw 13 rows at random from the wines with quality 7 or 8 and keep only the
# alcohol column. reset_index(drop=True) renumbers the rows from 0 and
# discards the old row labels.
sample_7_and_above = (
    data[data['quality'].isin([7, 8])]
    .sample(13)[['alcohol']]
    .reset_index(drop=True)
)
sample_7_and_above

Output:-
Hypothesis testing in Machine learning using Python
#Example no.1
from scipy.stats import ttest_1samp
import numpy as np

# Load the ages and report their mean
ages = np.genfromtxt("ages.csv")
print(ages)
ages_mean = np.mean(ages)
print(ages_mean)

# One-sample t-test of the ages against a hypothesised mean of 30
tset, pval = ttest_1samp(ages, 30)
print("p-values", pval)

if pval < 0.05:  # alpha value is 0.05 or 5%
    print(" we are rejecting null hypothesis")
else:
    print("we are accepting null hypothesis")

Output:-

import pandas as pd
from scipy import stats

# Load the blood-pressure readings and summarise the two columns
# (the summary is displayed only when run in a notebook cell)
df = pd.read_csv("blood_pressure.csv")
df[['bp_before','bp_after']].describe()

# Paired (related-samples) t-test on the before/after columns
ttest, pval = stats.ttest_rel(df['bp_before'], df['bp_after'])
print(pval)

# Compare against a 5% significance level
if pval<0.05:
    print("reject null hypothesis")
else:
    print("accept null hypothesis")
import pandas as pd
from scipy import stats
from statsmodels.stats import weightstats as stests

# One-sample z-test of bp_before against a hypothesised value of 156
ztest, pval = stests.ztest(df['bp_before'], x2=None, value=156)
print(float(pval))

# Compare against a 5% significance level
if pval<0.05:
    print("reject null hypothesis")
else:
    print("accept null hypothesis")

You might also like