0% found this document useful (0 votes)
20 views5 pages

Hariks

Uploaded by

kanish
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
20 views5 pages

Hariks

Uploaded by

kanish
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 5

AB

# --- Permutation test: maternal smoking vs. birth weight ---
import matplotlib
import matplotlib.pyplot as plots  # NOTE(review): alias unused below; `plt` is used instead
# %matplotlib inline  (Jupyter magic; invalid in a plain .py file — kept as a comment)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline
import seaborn as sns

# Load the data; assumes baby.csv has 'Maternal Smoker' (bool) and
# 'Birth Weight' (numeric) columns — confirm against the file.
baby = pd.read_csv('baby.csv')
smoking_and_birthweight = baby[['Maternal Smoker', 'Birth Weight']]

# Separate birth weights for smokers and non-smokers
smoker = smoking_and_birthweight['Birth Weight'][smoking_and_birthweight['Maternal Smoker'] == True]
non_smoker = smoking_and_birthweight['Birth Weight'][smoking_and_birthweight['Maternal Smoker'] == False]

# One histogram panel per smoking status
smoking_and_birthweight.hist(by='Maternal Smoker')

# Overlaid histograms for the two groups
smoker.hist(histtype='stepfilled', alpha=.5, bins=20)
non_smoker.hist(histtype='stepfilled', alpha=.5, color=sns.desaturate("indianred", .75), bins=10)
plt.xlabel('Women', fontsize=15)
plt.ylabel('Baby weight', fontsize=15)
plt.show()

# Observed statistic: mean(smoker weights) - mean(non-smoker weights).
# groupby sorts False before True, so iloc[1] is the smoker group.
means_table = smoking_and_birthweight.groupby('Maternal Smoker').mean()
observed_difference = means_table['Birth Weight'].iloc[1] - means_table['Birth Weight'].iloc[0]

# One demonstration shuffle. sample(n, replace=False) with n == len(df) is a
# full random permutation; use len() rather than the hard-coded 1174.
n_rows = len(smoking_and_birthweight)
shuffled = smoking_and_birthweight.sample(n_rows, replace=False)
shuffled_weights = shuffled['Birth Weight']
original_and_shuffled = smoking_and_birthweight.assign(shuffled_weights=shuffled_weights.values)
all_group_means = original_and_shuffled.groupby('Maternal Smoker').mean()
difference = all_group_means['shuffled_weights'].iloc[0] - all_group_means['shuffled_weights'].iloc[1]

# Permutation test: 5000 shuffles. Loop-body indentation restored (the pasted
# source lost it). The statistic is computed as iloc[1] - iloc[0] so its sign
# matches observed_difference (the original used iloc[0] - iloc[1] here,
# which flips the sign relative to the observed statistic).
differences = np.zeros(5000)
for i in np.arange(5000):
    shuffled = smoking_and_birthweight.sample(n_rows, replace=False)
    shuffled_weights = shuffled['Birth Weight']
    original_and_shuffled = smoking_and_birthweight.assign(shuffled_weights=shuffled_weights.values)
    all_group_means = original_and_shuffled.groupby('Maternal Smoker').mean()
    differences[i] = (all_group_means['shuffled_weights'].iloc[1]
                      - all_group_means['shuffled_weights'].iloc[0])

# Empirical distribution of the statistic under the null hypothesis
differences_df = pd.DataFrame(differences)
differences_df.hist(bins=np.arange(-5, 5, 0.5))
plt.title('Prediction Under Null Hypotheses')
plt.xlabel('Differences between Group Averages', fontsize=15)
plt.ylabel('Units', fontsize=15)
plt.show()

# Empirical p-value: proportion of shuffled differences <= observed.
# (A bare expression displays nothing outside a notebook, so print it.)
print('Observed Difference:', observed_difference)
print('Empirical p-value:',
      np.count_nonzero(differences <= observed_difference) / differences.size)
EMPIRICAL
# --- Alameda County jury panels: eligible population vs. actual panels ---
import pandas as pd
import matplotlib.pyplot as plt
# %matplotlib inline  (Jupyter magic; invalid in a plain .py file — kept as a comment)
import numpy as np

# Ethnic composition: proportion eligible for jury duty vs. proportion
# actually appearing on panels (each column sums to ~1).
jury = {"Ethnicity": ["Asian", "Black", "Latino", "White", "Other"],
        "Eligible": [0.15, 0.18, 0.12, 0.54, 0.01],
        "Panels": [0.26, 0.08, 0.08, 0.54, 0.04]}
jury

Alameda_df = pd.DataFrame(jury)
Alameda_df

# Index by ethnicity so plots and lookups are labeled by group
Alameda_df_1 = Alameda_df.set_index('Ethnicity')
Alameda_df_1

# Horizontal bar chart comparing the two distributions; invert the y-axis
# so the first ethnicity appears at the top
Alameda_df_1.plot.barh()
plt.ylabel('Ethnicity')
plt.gca().invert_yaxis()

# Signed and absolute per-group differences (Panels - Eligible)
Alameda_df_1['jury_with_diffs'] = Alameda_df_1['Panels'] - Alameda_df_1['Eligible']
Alameda_df_1
Alameda_df_1['Abs.Difference'] = abs(Alameda_df_1['jury_with_diffs'])
Alameda_df_1

# Test statistic: total variation distance = half the sum of |differences|
test_statistic = Alameda_df_1['Abs.Difference'].sum() / 2
test_statistic
# Function to calculate total variation distance (TVD).
# NOTE: body indentation restored — the pasted source had the return
# statement at column 0, which is a syntax error.
def total_variation_distance(distribution_1, distribution_2):
    """Return the total variation distance between two distributions.

    Both arguments are element-wise-subtractable array-likes (numpy arrays
    or pandas Series) of probabilities over the same categories. TVD is
    half the sum of the absolute pointwise differences.
    """
    return np.abs(distribution_1 - distribution_2).sum() / 2
# Function to calculate TVD between two columns of a DataFrame.
# NOTE: body indentation restored — the pasted source had the return
# statement at column 0, which is a syntax error.
def table_tvd(table, label, other):
    """Return the TVD between columns `label` and `other` of `table`."""
    return total_variation_distance(table[label], table[other])
# Observed statistic: TVD between the eligible and panel distributions
observed_stat = table_tvd(Alameda_df, 'Eligible', 'Panels')
print(observed_stat)

# Simulate drawing a panel at random from the eligible distribution.
# Name the size and distribution once instead of repeating the literals.
panel_size = 1453
eligible_dist = [0.15, 0.18, 0.12, 0.54, 0.01]
import numpy.random as npr
np.random.multinomial(panel_size, eligible_dist)

# New DataFrame holding one random panel, converted from counts to proportions
Alameda_df_2 = pd.DataFrame(Alameda_df_1, columns=['Eligible', 'Panels'])
Alameda_df_2['Random Sample'] = np.random.multinomial(panel_size, eligible_dist) / panel_size
Alameda_df_2

# Bar chart of eligible / panels / random sample side by side
Alameda_df_2.plot.barh()
plt.ylabel('Ethnicity')
plt.gca().invert_yaxis()

# TVD between the eligible distribution and the one random sample
TVD = (abs(Alameda_df_2['Eligible'] - Alameda_df_2['Random Sample'])).sum() / 2
TVD

# Run 5000 simulated panels, recording each TVD.
# Loop-body indentation restored (the pasted source lost it).
simulations = 5000
tvd_list = []
for i in np.arange(simulations):
    Alameda_df_2["Random Sample"] = npr.multinomial(panel_size, eligible_dist) / panel_size
    tvd_list.append(table_tvd(Alameda_df_2, 'Eligible', 'Random Sample'))

# Collect the simulated TVDs into a labeled DataFrame
tvd_final_df = pd.DataFrame(tvd_list)
tvd_final_df.rename(columns={0: "TVD"}, inplace=True)
tvd_final_df.head()

# Histogram of simulated TVDs under the null hypothesis
tvd_final_df.hist(bins=np.arange(0, 0.2, 0.005))
plt.ylabel('Percent per unit')
plt.xlabel('TVD')

# Mark the observed statistic on the x-axis in red
plt.scatter(observed_stat, 0, color='red', s=30)
CAUSALITY
# --- Permutation test on a randomized controlled trial (bta.csv) ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline  (Jupyter magic; invalid in a plain .py file — kept as a comment)

# Load the trial data; assumes columns 'Group' (control/treatment) and a
# numeric 'Result' per subject — confirm against the file.
bta = pd.read_csv('bta.csv')
bta

# Mean result per group (groupby sorts group labels alphabetically)
bta_table = bta.groupby('Group').mean()

# Observed statistic: difference between the two group means
observed_difference = (bta_table["Result"].iloc[1] - bta_table["Result"].iloc[0])
observed_difference

# One demonstration shuffle. sample(n, replace=False) with n == len(bta) is
# a full random permutation; use len() rather than the hard-coded 31.
n_subjects = len(bta)
shuffled = bta.sample(n_subjects, replace=False)
shuffled

# Shuffled 'Result' column, paired back with the ORIGINAL group labels
bta_shuffled_results = shuffled['Result']
type(bta_shuffled_results)
original_and_shuffled = bta.assign(bta_shuffled_results=bta_shuffled_results.values)
original_and_shuffled

# Group means of the shuffled results
all_group_means = original_and_shuffled.groupby('Group').mean()
all_group_means

# Absolute distance between the shuffled group means
distance = np.absolute(all_group_means['bta_shuffled_results'].iloc[0] - all_group_means['bta_shuffled_results'].iloc[1])

# 5000 shuffles under the null hypothesis.
# Loop-body indentation restored (the pasted source lost it).
distances = np.zeros(5000)
for i in np.arange(5000):
    shuffled = bta.sample(n_subjects, replace=False)
    bta_shuffled_results = shuffled['Result']
    original_and_shuffled = bta.assign(bta_shuffled_results=bta_shuffled_results.values)
    all_group_means = original_and_shuffled.groupby('Group').mean()
    distances[i] = np.absolute(all_group_means['bta_shuffled_results'].iloc[0]
                               - all_group_means['bta_shuffled_results'].iloc[1])

# Null distribution of the absolute distance
distances_df = pd.DataFrame(distances)
distances_df
distances_df.hist(bins=np.arange(0, 0.7, 0.1))
plt.title('Prediction Under Null Hypotheses')
plt.xlabel('Distance Under Null Hypothesis', fontsize=15)
plt.ylabel('Units', fontsize=15)

# Mark the observed difference on the x-axis in red
plt.scatter(observed_difference, 0, color='red', s=30)

# Empirical p-value: proportion of null distances >= the observed difference
empirical_P = np.count_nonzero(distances >= observed_difference) / distances.size
empirical_P
print('Observed Distance:', observed_difference)
# Round AFTER converting to percent; the original round(p, 2) * 100
# quantized the p-value to whole-percent steps.
print('Empirical P-value:', round(empirical_P * 100, 2), '%')
GSI

# --- Random sampling: is section 3's midterm average unusually low? ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# %matplotlib inline  (Jupyter magic; invalid in a plain .py file — kept as a comment)

# Load scores; assumes columns 'Section' and 'Midterm' — confirm against the file.
scores = pd.read_csv('scores_by_section.csv')
scores

# Average midterm score per section
section_averages = scores.groupby(by="Section").mean()
section_averages.head(6)

# Section 3's average (label-based lookup on the Section index, not position)
sec_3_average = section_averages['Midterm'][3]
sec_3_average

# Number of students per section — presumably section 3 has 27 students,
# which is why n=27 is used as the sample size below; verify against this table.
scores.groupby('Section').count()

# All midterm scores, ignoring section
scores_only = scores['Midterm']
scores_only

# One demonstration sample of 27 scores without replacement
sampled_scores = scores_only.sample(n=27, replace=False)
sampled_scores

# 5000 random samples of size 27; record each sample's mean.
# Loop-body indentation restored (the pasted source lost it).
sample_means = np.zeros(5000)
for i in np.arange(5000):
    sample_means[i] = scores_only.sample(n=27, replace=False).mean()

# Last element of the sample means array
sample_means[4999]

# Distribution of the simulated sample means
sample_df = pd.DataFrame(sample_means)
sample_df.head(3)
sample_df.hist(bins=np.arange(10, 20, 1))
plt.title('Marks Distribution')
plt.xlabel('Sample Average', fontsize=15)
plt.ylabel('Units', fontsize=15)

# Mark section 3's average on the x-axis in red
plt.scatter(sec_3_average, 0, color='red', s=30)

sample_means.size
sample_means

# Empirical p-value: proportion of sample means <= section 3's average.
# (A bare expression displays nothing outside a notebook, so print it.)
print(np.count_nonzero(sample_means <= sec_3_average) / sample_means.size)

You might also like