# Author: Hariks
import matplotlib
import matplotlib.pyplot as plots
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Load the baby data and keep only the two columns used below.
baby = pd.read_csv('baby.csv')
smoking_and_birthweight = baby[['Maternal Smoker', 'Birth Weight']]
# Split the birth weights by maternal smoking status.
weights = smoking_and_birthweight['Birth Weight']
labels = smoking_and_birthweight['Maternal Smoker']
smoker = weights[labels == True]
non_smoker = weights[labels == False]
# Plot histogram
# Paneled histograms of birth weight, one panel per smoking status.
smoking_and_birthweight.hist(by='Maternal Smoker')
# Histogram for smokers and non-smokers
# Overlaid, semi-transparent histograms of the two groups on one axes.
smoker.hist(histtype='stepfilled', alpha=.5, bins=20)
non_smoker.hist(histtype='stepfilled', alpha=.5, color=sns.desaturate("indianred", .75), bins=10)
# Labels
plt.xlabel('Women', fontsize=15)
plt.ylabel('Baby weight', fontsize=15)
plt.show()
# Group by maternal smoking and calculate mean birth weights
means_table = smoking_and_birthweight.groupby('Maternal Smoker').mean()
# Observed statistic: smokers' mean minus non-smokers' mean.
# (groupby on a boolean column orders the groups False, then True.)
observed_difference = means_table['Birth Weight'].iloc[1] - means_table['Birth Weight'].iloc[0]
# One permutation: sampling every row without replacement is a shuffle.
# frac=1 avoids hard-coding the table length (originally 1174), so this
# keeps working if the dataset changes size.
shuffled = smoking_and_birthweight.sample(frac=1, replace=False)
shuffled_weights = shuffled['Birth Weight']
# Pair the shuffled weights with the original group labels.
original_and_shuffled = smoking_and_birthweight.assign(shuffled_weights=shuffled_weights.values)
# Calculate group means and the difference under this one shuffle.
all_group_means = original_and_shuffled.groupby('Maternal Smoker').mean()
difference = all_group_means['shuffled_weights'].iloc[0] - all_group_means['shuffled_weights'].iloc[1]
# Permutation test: repeat the shuffle many times to build the null
# distribution of the difference between group means.
repetitions = 5000
differences = np.zeros(repetitions)
for i in np.arange(repetitions):
    # sample(frac=1) shuffles all rows without hard-coding the length
    # (originally 1174); re-slicing `baby` every iteration was invariant
    # work and has been hoisted out.
    shuffled = smoking_and_birthweight.sample(frac=1, replace=False)
    original_and_shuffled = smoking_and_birthweight.assign(
        shuffled_weights=shuffled['Birth Weight'].values)
    all_group_means = original_and_shuffled.groupby('Maternal Smoker').mean()
    differences[i] = (all_group_means['shuffled_weights'].iloc[0]
                      - all_group_means['shuffled_weights'].iloc[1])
# Null distribution of the simulated differences.
differences_df = pd.DataFrame(differences)
differences_df.hist(bins=np.arange(-5, 5, 0.5))
plt.title('Prediction Under Null Hypotheses')
plt.xlabel('Differences between Group Averages', fontsize=15)
plt.ylabel('Units', fontsize=15)
plt.show()
print('Observed Difference:', observed_difference)
# Empirical p-value: fraction of simulated differences at least as small
# as the observed one. In a plain script the bare expression's value was
# silently discarded, so print it explicitly.
empirical_p = np.count_nonzero(differences <= observed_difference) / differences.size
print('Empirical P-value:', empirical_p)
# EMPIRICAL
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
# Ethnic composition (as proportions) of eligible jurors vs. actual
# jury panels in Alameda County.
ethnicities = ["Asian", "Black", "Latino", "White", "Other"]
jury = {"Ethnicity": ethnicities,
        "Eligible": [0.15, 0.18, 0.12, 0.54, 0.01],
        "Panels": [0.26, 0.08, 0.08, 0.54, 0.04]}
jury
# Build the DataFrame, then a second view indexed by ethnicity.
Alameda_df = pd.DataFrame(jury)
Alameda_df
Alameda_df_1 = Alameda_df.set_index('Ethnicity')
Alameda_df_1
# Plot horizontal bar chart
# Compare eligible-population shares with actual panel shares per group.
Alameda_df_1.plot.barh()
plt.ylabel('Ethnicity')
# Flip the y-axis so the first DataFrame row appears at the top.
plt.gca().invert_yaxis()
# Signed difference per group: panel share minus eligible share.
Alameda_df_1['jury_with_diffs'] = Alameda_df_1['Panels'] - Alameda_df_1['Eligible']
Alameda_df_1
# Absolute differences feed the total variation distance below.
Alameda_df_1['Abs.Difference'] = Alameda_df_1['jury_with_diffs'].abs()
Alameda_df_1
# Total variation distance: half the sum of the absolute differences.
test_statistic = Alameda_df_1['Abs.Difference'].sum() / 2
test_statistic
# Function to calculate total variation distance (TVD)
def total_variation_distance(distribution_1, distribution_2):
    """Return the TVD between two distributions: half their L1 distance."""
    absolute_gaps = np.abs(distribution_1 - distribution_2)
    return absolute_gaps.sum() / 2
# Function to calculate TVD between two columns of a DataFrame
def table_tvd(table, label, other):
    """Return the TVD between columns *label* and *other* of *table*."""
    first, second = table[label], table[other]
    return total_variation_distance(first, second)
# Calculate observed statistic
# Observed TVD between the eligible-juror and panel distributions.
observed_stat = table_tvd(Alameda_df, 'Eligible', 'Panels')
print(observed_stat)
# Simulate drawing one random panel of panel_size jurors from the
# eligible distribution, then convert counts to proportions.
panel_size = 1453
import numpy.random as npr
# Name the probability vector once instead of repeating the literals,
# and use panel_size instead of the magic number 1453.
eligible_distribution = [0.15, 0.18, 0.12, 0.54, 0.01]
np.random.multinomial(panel_size, eligible_distribution)
# New DataFrame holding the two observed distributions plus one
# simulated random sample.
Alameda_df_2 = pd.DataFrame(Alameda_df_1, columns=['Eligible', 'Panels'])
Alameda_df_2['Random Sample'] = (
    np.random.multinomial(panel_size, eligible_distribution) / panel_size)
Alameda_df_2
# Bar chart now including the simulated random sample.
Alameda_df_2.plot.barh()
plt.ylabel('Ethnicity')
plt.gca().invert_yaxis()
# TVD between the eligible distribution and the one simulated sample.
gaps = Alameda_df_2['Eligible'] - Alameda_df_2['Random Sample']
TVD = gaps.abs().sum() / 2
TVD
# Run the sampling simulation many times, recording the TVD between the
# eligible distribution and each simulated random panel.
simulations = 5000
eligible_probs = [0.15, 0.18, 0.12, 0.54, 0.01]
tvd_list = []
for i in np.arange(simulations):
    # Use panel_size for both the draw and the normalisation; the
    # original hard-coded 1453 in the draw while dividing by panel_size.
    Alameda_df_2["Random Sample"] = (
        npr.multinomial(panel_size, eligible_probs) / panel_size)
    tvd_list.append(table_tvd(Alameda_df_2, 'Eligible', 'Random Sample'))
# Collect the simulated TVDs into a one-column DataFrame named up front
# (replaces the separate rename(columns={0: "TVD"}) step).
tvd_final_df = pd.DataFrame(tvd_list, columns=["TVD"])
tvd_final_df.head()
# Plot histogram of TVD simulations
# Null distribution of the TVD under random selection from the
# eligible population.
tvd_final_df.hist(bins=np.arange(0, 0.2, 0.005))
plt.ylabel('Percent per unit')
plt.xlabel('TVD')
# Plot observed statistic on the histogram
# Red dot marks the observed TVD on the x-axis for comparison with
# the simulated values.
plt.scatter(observed_stat, 0, color='red', s=30)
# CAUSALITY
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the bta treatment data.
bta = pd.read_csv('bta.csv')
bta
# Mean result within each treatment group.
bta_table = bta.groupby('Group').mean()
# Observed statistic: difference between the two group means
# (second group minus first, in groupby order).
observed_difference = bta_table["Result"].iloc[1] - bta_table["Result"].iloc[0]
observed_difference
# One permutation of the results: sampling every row without replacement
# is a shuffle. frac=1 avoids hard-coding the row count (originally 31),
# so this keeps working if the dataset changes size.
shuffled = bta.sample(frac=1, replace=False)
shuffled
# Shuffled 'Result' column (a pandas Series).
bta_shuffled_results = shuffled['Result']
type(bta_shuffled_results)
# Pair the shuffled results with the original group labels.
original_and_shuffled = bta.assign(bta_shuffled_results=bta_shuffled_results.values)
original_and_shuffled
# Group means of both the original and shuffled results.
all_group_means = original_and_shuffled.groupby('Group').mean()
all_group_means
# Absolute distance between the two group means under this shuffle.
distance = np.absolute(all_group_means['bta_shuffled_results'].iloc[0]
                       - all_group_means['bta_shuffled_results'].iloc[1])
# Permutation test: repeat the shuffle to build the null distribution of
# the absolute distance between group means.
repetitions = 5000
distances = np.zeros(repetitions)
for i in np.arange(repetitions):
    # frac=1 shuffles all rows regardless of table size (was 31).
    shuffled = bta.sample(frac=1, replace=False)
    original_and_shuffled = bta.assign(bta_shuffled_results=shuffled['Result'].values)
    all_group_means = original_and_shuffled.groupby('Group').mean()
    distances[i] = np.absolute(all_group_means['bta_shuffled_results'].iloc[0]
                               - all_group_means['bta_shuffled_results'].iloc[1])
# Null distribution of the simulated distances.
distances_df = pd.DataFrame(distances)
distances_df
# Plot histogram of distances
distances_df.hist(bins=np.arange(0, 0.7, 0.1))
plt.title('Prediction Under Null Hypotheses')
plt.xlabel('Distance Under Null Hypothesis', fontsize=15)
plt.ylabel('Units', fontsize=15)
# Red dot marks the observed distance on the null distribution.
plt.scatter(observed_difference, 0, color='red', s=30)
# Empirical p-value: share of simulated distances at least as extreme
# as the observed one.
empirical_P = np.count_nonzero(distances >= observed_difference) / distances.size
empirical_P
print('Observed Distance:', observed_difference)
# Round after converting to percent; the original rounded the raw
# proportion to 2 dp first, which collapses e.g. 0.4% to 0%.
print('Empirical P-value:', round(empirical_P * 100, 2), '%')
# GSI
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Load the per-student scores with section assignments.
scores = pd.read_csv('scores_by_section.csv')
scores
# Mean scores within each section.
section_averages = scores.groupby(by="Section").mean()
section_averages.head(6)
# Average midterm score for section 3. Use .loc for explicit label-based
# selection: the original chained section_averages['Midterm'][3] is
# ambiguous (positional vs. label) on an integer index and is deprecated
# in recent pandas.
sec_3_average = section_averages.loc[3, 'Midterm']
sec_3_average
# Number of rows (students) in each section.
scores.groupby('Section').count()
# All midterm scores, pooled across sections.
scores_only = scores['Midterm']
scores_only
# One random "section" of 27 students drawn without replacement.
sampled_scores = scores_only.sample(n=27, replace=False)
sampled_scores
# Draw 5000 such samples and record each sample's mean.
sample_means = np.zeros(5000)
for i in np.arange(5000):
    sampled_scores = scores_only.sample(n=27, replace=False)
    sample_means[i] = sampled_scores.mean()
# Last simulated mean, as a quick sanity check.
sample_means[4999]
# Distribution of the simulated sample means.
sample_df = pd.DataFrame(sample_means)
sample_df.head(3)
sample_df.hist(bins=np.arange(10, 20, 1))
plt.title('Marks Distribution')
plt.xlabel('Sample Average', fontsize=15)
plt.ylabel('Units', fontsize=15)
# Red dot marks section 3's observed average on the histogram.
plt.scatter(sec_3_average, 0, color='red', s=30)
sample_means.size
sample_means
# Proportion of random sample means at or below section 3's average.
# In a plain script the bare expression's value was silently discarded,
# so bind and print it explicitly.
proportion_below = np.count_nonzero(sample_means <= sec_3_average) / sample_means.size
print('Proportion of sample means <= section 3 average:', proportion_below)