# Author: Hariks
import matplotlib
import matplotlib.pyplot as plots
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Load the baby data and keep only the two columns used below.
baby = pd.read_csv('baby.csv')
smoking_and_birthweight = baby[['Maternal Smoker', 'Birth Weight']]
# Split the birth weights by maternal smoking status.
weights = smoking_and_birthweight['Birth Weight']
labels = smoking_and_birthweight['Maternal Smoker']
smoker = weights[labels == True]
non_smoker = weights[labels == False]
# Plot histogram
# Paneled histograms of birth weight, one panel per smoking status.
smoking_and_birthweight.hist(by='Maternal Smoker')
# Histogram for smokers and non-smokers
# Overlaid, semi-transparent histograms of the two groups on one axes.
smoker.hist(histtype='stepfilled', alpha=.5, bins=20)
non_smoker.hist(histtype='stepfilled', alpha=.5, color=sns.desaturate("indianred", .75), bins=10)
# Labels
plt.xlabel('Women', fontsize=15)
plt.ylabel('Baby weight', fontsize=15)
plt.show()
# Group by maternal smoking and calculate mean birth weights
means_table = smoking_and_birthweight.groupby('Maternal Smoker').mean()
# Observed statistic: smokers' mean minus non-smokers' mean.
# (groupby on a boolean column orders the groups False, then True.)
observed_difference = means_table['Birth Weight'].iloc[1] - means_table['Birth Weight'].iloc[0]
# One permutation: sampling every row without replacement is a shuffle.
# frac=1 avoids hard-coding the table length (originally 1174), so this
# keeps working if the dataset changes size.
shuffled = smoking_and_birthweight.sample(frac=1, replace=False)
shuffled_weights = shuffled['Birth Weight']
# Pair the shuffled weights with the original group labels.
original_and_shuffled = smoking_and_birthweight.assign(shuffled_weights=shuffled_weights.values)
# Calculate group means and the difference under this one shuffle.
all_group_means = original_and_shuffled.groupby('Maternal Smoker').mean()
difference = all_group_means['shuffled_weights'].iloc[0] - all_group_means['shuffled_weights'].iloc[1]
# Permutation test: repeat the shuffle many times to build the null
# distribution of the difference between group means.
repetitions = 5000
differences = np.zeros(repetitions)
for i in np.arange(repetitions):
    # sample(frac=1) shuffles all rows without hard-coding the length
    # (originally 1174); re-slicing `baby` every iteration was invariant
    # work and has been hoisted out.
    shuffled = smoking_and_birthweight.sample(frac=1, replace=False)
    original_and_shuffled = smoking_and_birthweight.assign(
        shuffled_weights=shuffled['Birth Weight'].values)
    all_group_means = original_and_shuffled.groupby('Maternal Smoker').mean()
    differences[i] = (all_group_means['shuffled_weights'].iloc[0]
                      - all_group_means['shuffled_weights'].iloc[1])
# Null distribution of the simulated differences.
differences_df = pd.DataFrame(differences)
differences_df.hist(bins=np.arange(-5, 5, 0.5))
plt.title('Prediction Under Null Hypotheses')
plt.xlabel('Differences between Group Averages', fontsize=15)
plt.ylabel('Units', fontsize=15)
plt.show()
print('Observed Difference:', observed_difference)
# Empirical p-value: fraction of simulated differences at least as small
# as the observed one. In a plain script the bare expression's value was
# silently discarded, so print it explicitly.
empirical_p = np.count_nonzero(differences <= observed_difference) / differences.size
print('Empirical P-value:', empirical_p)
# EMPIRICAL
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
# Ethnic composition (as proportions) of eligible jurors vs. actual
# jury panels in Alameda County.
ethnicities = ["Asian", "Black", "Latino", "White", "Other"]
jury = {"Ethnicity": ethnicities,
        "Eligible": [0.15, 0.18, 0.12, 0.54, 0.01],
        "Panels": [0.26, 0.08, 0.08, 0.54, 0.04]}
jury
# Build the DataFrame, then a second view indexed by ethnicity.
Alameda_df = pd.DataFrame(jury)
Alameda_df
Alameda_df_1 = Alameda_df.set_index('Ethnicity')
Alameda_df_1
# Plot horizontal bar chart
# Compare eligible-population shares with actual panel shares per group.
Alameda_df_1.plot.barh()
plt.ylabel('Ethnicity')
# Flip the y-axis so the first DataFrame row appears at the top.
plt.gca().invert_yaxis()
# Signed difference per group: panel share minus eligible share.
Alameda_df_1['jury_with_diffs'] = Alameda_df_1['Panels'] - Alameda_df_1['Eligible']
Alameda_df_1
# Absolute differences feed the total variation distance below.
Alameda_df_1['Abs.Difference'] = Alameda_df_1['jury_with_diffs'].abs()
Alameda_df_1
# Total variation distance: half the sum of the absolute differences.
test_statistic = Alameda_df_1['Abs.Difference'].sum() / 2
test_statistic
# Function to calculate total variation distance (TVD)
def total_variation_distance(distribution_1, distribution_2):
    """Return the TVD between two distributions: half their L1 distance."""
    absolute_gaps = np.abs(distribution_1 - distribution_2)
    return absolute_gaps.sum() / 2
# Function to calculate TVD between two columns of a DataFrame
def table_tvd(table, label, other):
    """Return the TVD between columns *label* and *other* of *table*."""
    first, second = table[label], table[other]
    return total_variation_distance(first, second)
# Calculate observed statistic
# Observed TVD between the eligible-juror and panel distributions.
observed_stat = table_tvd(Alameda_df, 'Eligible', 'Panels')
print(observed_stat)
# Simulate drawing one random panel of panel_size jurors from the
# eligible distribution, then convert counts to proportions.
panel_size = 1453
import numpy.random as npr
# Name the probability vector once instead of repeating the literals,
# and use panel_size instead of the magic number 1453.
eligible_distribution = [0.15, 0.18, 0.12, 0.54, 0.01]
np.random.multinomial(panel_size, eligible_distribution)
# New DataFrame holding the two observed distributions plus one
# simulated random sample.
Alameda_df_2 = pd.DataFrame(Alameda_df_1, columns=['Eligible', 'Panels'])
Alameda_df_2['Random Sample'] = (
    np.random.multinomial(panel_size, eligible_distribution) / panel_size)
Alameda_df_2
# Bar chart now including the simulated random sample.
Alameda_df_2.plot.barh()
plt.ylabel('Ethnicity')
plt.gca().invert_yaxis()
# TVD between the eligible distribution and the one simulated sample.
gaps = Alameda_df_2['Eligible'] - Alameda_df_2['Random Sample']
TVD = gaps.abs().sum() / 2
TVD
# Run the sampling simulation many times, recording the TVD between the
# eligible distribution and each simulated random panel.
simulations = 5000
eligible_probs = [0.15, 0.18, 0.12, 0.54, 0.01]
tvd_list = []
for i in np.arange(simulations):
    # Use panel_size for both the draw and the normalisation; the
    # original hard-coded 1453 in the draw while dividing by panel_size.
    Alameda_df_2["Random Sample"] = (
        npr.multinomial(panel_size, eligible_probs) / panel_size)
    tvd_list.append(table_tvd(Alameda_df_2, 'Eligible', 'Random Sample'))
# Collect the simulated TVDs into a one-column DataFrame named up front
# (replaces the separate rename(columns={0: "TVD"}) step).
tvd_final_df = pd.DataFrame(tvd_list, columns=["TVD"])
tvd_final_df.head()
# Plot histogram of TVD simulations
# Null distribution of the TVD under random selection from the
# eligible population.
tvd_final_df.hist(bins=np.arange(0, 0.2, 0.005))
plt.ylabel('Percent per unit')
plt.xlabel('TVD')
# Plot observed statistic on the histogram
# Red dot marks the observed TVD on the x-axis for comparison with
# the simulated values.
plt.scatter(observed_stat, 0, color='red', s=30)
# CAUSALITY
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the bta treatment data.
bta = pd.read_csv('bta.csv')
bta
# Mean result within each treatment group.
bta_table = bta.groupby('Group').mean()
# Observed statistic: difference between the two group means
# (second group minus first, in groupby order).
observed_difference = bta_table["Result"].iloc[1] - bta_table["Result"].iloc[0]
observed_difference
# One permutation of the results: sampling every row without replacement
# is a shuffle. frac=1 avoids hard-coding the row count (originally 31),
# so this keeps working if the dataset changes size.
shuffled = bta.sample(frac=1, replace=False)
shuffled
# Shuffled 'Result' column (a pandas Series).
bta_shuffled_results = shuffled['Result']
type(bta_shuffled_results)
# Pair the shuffled results with the original group labels.
original_and_shuffled = bta.assign(bta_shuffled_results=bta_shuffled_results.values)
original_and_shuffled
# Group means of both the original and shuffled results.
all_group_means = original_and_shuffled.groupby('Group').mean()
all_group_means
# Absolute distance between the two group means under this shuffle.
distance = np.absolute(all_group_means['bta_shuffled_results'].iloc[0]
                       - all_group_means['bta_shuffled_results'].iloc[1])
# Permutation test: repeat the shuffle to build the null distribution of
# the absolute distance between group means.
repetitions = 5000
distances = np.zeros(repetitions)
for i in np.arange(repetitions):
    # frac=1 shuffles all rows regardless of table size (was 31).
    shuffled = bta.sample(frac=1, replace=False)
    original_and_shuffled = bta.assign(bta_shuffled_results=shuffled['Result'].values)
    all_group_means = original_and_shuffled.groupby('Group').mean()
    distances[i] = np.absolute(all_group_means['bta_shuffled_results'].iloc[0]
                               - all_group_means['bta_shuffled_results'].iloc[1])
# Null distribution of the simulated distances.
distances_df = pd.DataFrame(distances)
distances_df
# Plot histogram of distances
distances_df.hist(bins=np.arange(0, 0.7, 0.1))
plt.title('Prediction Under Null Hypotheses')
plt.xlabel('Distance Under Null Hypothesis', fontsize=15)
plt.ylabel('Units', fontsize=15)
# Red dot marks the observed distance on the null distribution.
plt.scatter(observed_difference, 0, color='red', s=30)
# Empirical p-value: share of simulated distances at least as extreme
# as the observed one.
empirical_P = np.count_nonzero(distances >= observed_difference) / distances.size
empirical_P
print('Observed Distance:', observed_difference)
# Round after converting to percent; the original rounded the raw
# proportion to 2 dp first, which collapses e.g. 0.4% to 0%.
print('Empirical P-value:', round(empirical_P * 100, 2), '%')
# GSI
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the per-student scores with section assignments.
scores = pd.read_csv('scores_by_section.csv')
scores
# Mean scores within each section.
section_averages = scores.groupby(by="Section").mean()
section_averages.head(6)
# Average midterm score for section 3. Use .loc for explicit label-based
# selection: the original chained section_averages['Midterm'][3] is
# ambiguous (positional vs. label) on an integer index and is deprecated
# in recent pandas.
sec_3_average = section_averages.loc[3, 'Midterm']
sec_3_average
# Number of rows (students) in each section.
scores.groupby('Section').count()
# All midterm scores, pooled across sections.
scores_only = scores['Midterm']
scores_only
# One random "section" of 27 students drawn without replacement.
sampled_scores = scores_only.sample(n=27, replace=False)
sampled_scores
# Draw 5000 such samples and record each sample's mean.
sample_means = np.zeros(5000)
for i in np.arange(5000):
    sampled_scores = scores_only.sample(n=27, replace=False)
    sample_means[i] = sampled_scores.mean()
# Last simulated mean, as a quick sanity check.
sample_means[4999]
# Distribution of the simulated sample means.
sample_df = pd.DataFrame(sample_means)
sample_df.head(3)
sample_df.hist(bins=np.arange(10, 20, 1))
plt.title('Marks Distribution')
plt.xlabel('Sample Average', fontsize=15)
plt.ylabel('Units', fontsize=15)
# Red dot marks section 3's observed average on the histogram.
plt.scatter(sec_3_average, 0, color='red', s=30)
sample_means.size
sample_means
# Proportion of random sample means at or below section 3's average.
# In a plain script the bare expression's value was silently discarded,
# so bind and print it explicitly.
proportion_below = np.count_nonzero(sample_means <= sec_3_average) / sample_means.size
print('Proportion of sample means <= section 3 average:', proportion_below)