Sampling Chapter3
Sampling Chapter3
point estimates
SAMPLING IN R
Richie Cotton
Data Evangelist at DataCamp
Sample size is number of rows
coffee_ratings %>%
  slice_sample(n = 300) %>%
  nrow()
300
coffee_ratings %>%
  slice_sample(prop = 0.25) %>%
  nrow()
334
SAMPLING IN R
Various sample sizes
coffee_ratings %>%
  summarize(mean_points = mean(total_cup_points)) %>%
  pull(mean_points)
82.15
coffee_ratings %>%
  slice_sample(n = 10) %>%
  summarize(mean_points = mean(total_cup_points)) %>%
  pull(mean_points)
82.82
82.02 82.16
SAMPLING IN R
Relative errors
Population parameter
Point estimate
SAMPLING IN R
Relative error vs. sample size
ggplot(errors, aes(sample_size, relative_error)) +
geom_line() +
geom_smooth(method = "loess")
SAMPLING IN R
Let's practice!
SAMPLING IN R
Creating a sampling
distribution
SAMPLING IN R
Richie Cotton
Data Evangelist at DataCamp
Same code, different answer
coffee_ratings %>% coffee_ratings %>%
slice_sample(n = 30) %>% slice_sample(n = 30) %>%
summarize(mean_cup_points = mean(total_cup_points)) %>% summarize(mean_cup_points = mean(total_cup_points)) %>%
pull(mean_cup_points) pull(mean_cup_points)
83.33 82.59
82.16 82.25
SAMPLING IN R
Same code, 1000 times
mean_cup_points_1000 <- replicate(
  n = 1000,
  expr = coffee_ratings %>%
    slice_sample(n = 30) %>%
    summarize(
      mean_cup_points = mean(total_cup_points)
    ) %>%
    pull(mean_cup_points)
)
[1] 81.65 81.57 82.66 82.27 81.76 81.74 82.71
[8] 82.20 80.43 82.45 82.29 82.63 82.28 82.11
[15] 82.14 81.72 81.97 82.58 81.78 82.47 81.73
[22] 82.78 82.14 82.39 81.69 82.36 82.64 82.68
[29] 82.56 82.14 82.72 82.43 81.68 82.74 82.80
[36] 82.12 82.31 81.02 82.83 81.71 82.25 82.11
[43] 82.76 82.26 81.57 82.00 81.75 81.47 81.99
[50] 82.68 82.05 82.43 82.40 82.66 80.78 82.43
...
[967] 81.84 83.12 81.54 81.83 82.24 82.36 82.49
[974] 82.05 82.08 81.98 82.45 82.04 81.42 83.06
[981] 81.97 82.65 81.12 82.48 81.64 81.92 81.96
[988] 81.71 81.96 81.78 82.30 81.76 82.46 82.43
[995] 81.95 82.60 81.84 82.78 82.23 82.56
SAMPLING IN R
Preparing for plotting
library(tibble)
sample_means <- tibble(
  sample_mean = mean_cup_points_1000
)
# A tibble: 1,000 x 1
sample_mean
<dbl>
1 83.3
2 82.6
3 82.2
4 82.2
5 81.7
6 81.6
7 82.7
8 82.3
9 81.8
10 81.7
# ... with 990 more rows
SAMPLING IN R
Distribution of sample means for size 30
ggplot(sample_means, aes(sample_mean)) +
geom_histogram(binwidth = 0.1)
SAMPLING IN R
Different sample sizes
Sample size 6 Sample size 150
SAMPLING IN R
Let's practice!
SAMPLING IN R
Approximate
sampling
distributions
SAMPLING IN R
Richie Cotton
Data Evangelist at DataCamp
4 dice
library(tidyr)
dice <- expand_grid(
  die1 = 1:6,
  die2 = 1:6,
  die3 = 1:6,
  die4 = 1:6
)
# A tibble: 1,296 x 4
die1 die2 die3 die4
<int> <int> <int> <int>
1 1 1 1 1
2 1 1 1 2
3 1 1 1 3
4 1 1 1 4
5 1 1 1 5
6 1 1 1 6
7 1 1 2 1
8 1 1 2 2
9 1 1 2 3
10 1 1 2 4
# ... with 1,286 more rows
SAMPLING IN R
Mean roll
dice <- expand_grid(
  die1 = 1:6,
  die2 = 1:6,
  die3 = 1:6,
  die4 = 1:6
) %>%
  mutate(
    mean_roll = (die1 + die2 + die3 + die4) / 4
  )
# A tibble: 1,296 x 5
die1 die2 die3 die4 mean_roll
<int> <int> <int> <int> <dbl>
1 1 1 1 1 1
2 1 1 1 2 1.25
3 1 1 1 3 1.5
4 1 1 1 4 1.75
5 1 1 1 5 2
6 1 1 1 6 2.25
7 1 1 2 1 1.25
8 1 1 2 2 1.5
9 1 1 2 3 1.75
10 1 1 2 4 2
# ... with 1,286 more rows
SAMPLING IN R
Exact sampling distribution
ggplot(dice, aes(factor(mean_roll))) +
geom_bar()
SAMPLING IN R
The number of outcomes increases fast
outcomes <- tibble(
n_dice = 1:100,
n_outcomes = 6 ^ n_dice
)
SAMPLING IN R
Simulating the mean of four dice rolls
SAMPLING IN R
Simulating the mean of four dice rolls
sample_means_1000 <- replicate(
  n = 1000,
  expr = {
    four_rolls <- sample(
      1:6, size = 4, replace = TRUE
    )
    mean(four_rolls)
  }
)
sample_means <- tibble(
  sample_mean = sample_means_1000
)
# A tibble: 1,000 x 1
sample_mean
<dbl>
1 4
2 4.5
3 2.5
4 3.75
5 3.75
6 4
7 3
10 4.25
# ... with 990 more rows
SAMPLING IN R
Approximate sampling distribution
ggplot(sample_means, aes(factor(sample_mean))) +
geom_bar()
SAMPLING IN R
Let's practice!
SAMPLING IN R
Standard errors and
the Central Limit
Theorem
SAMPLING IN R
Richie Cotton
Data Evangelist at DataCamp
Sampling distribution of mean cup points
SAMPLING IN R
Consequences of the central limit theorem
Averages of independent samples have approximately normal distributions.
As the sample size increases,
the distribution of the averages gets closer to being normally distributed, and
the width of the sampling distribution gets narrower.
SAMPLING IN R
Population & sampling distribution means
coffee_ratings %>%
  summarize(
    mean_cup_points = mean(total_cup_points)
  ) %>%
  pull(mean_cup_points)
Sample size Mean sample mean
5 82.1496
20 82.1610
80 82.1496
SAMPLING IN R
Population & sampling distribution standard deviations
coffee_ratings %>%
  summarize(
    sd_cup_points = sd(total_cup_points)
  ) %>%
  pull(sd_cup_points)
2.68686
Sample size Std dev sample mean
5 1.1929
20 0.6028
80 0.2865
320 0.1304
SAMPLING IN R
Population standard deviation over square root of sample size
Sample size Std dev sample mean Calculation Result
5 1.1929 2.68686 / sqrt(5) 1.2016
SAMPLING IN R
Let's practice!
SAMPLING IN R