R Notes For Data Analysis and Statistical Inference
R Notes For Data Analysis and Statistical Inference
Introduction to Data
# Size of the data frame: 'dim' reports the row count and the variable count.
dim(present)
# Names of the variables (columns) stored in the data frame:
names(present)
# Find the number of boys born each year, and assign your answer to 'num_boys':
num_boys <- present[["boys"]]
# Scatterplot of girls born (y) against year (x). The arguments stay written
# as 'present$...' because plot() builds its axis labels from them.
plot(present$year, present$girls)
# The same data drawn as a line plot:
plot(present$year, present$girls, type = "l")
# Year-by-year comparison: TRUE wherever boys outnumber girls.
present[["boys"]] > present[["girls"]]
# Boy-to-girl ratio for every year:
plot(present$year, present$boys / present$girls)
# Load the cdc data frame into the workspace. The address is the plain
# DataCamp asset URL; a proxy prefix that had been fused into the link
# made it point at the wrong host.
load(url("http://assets.datacamp.com/course/dasi/cdc.Rdata"))
# Print the first and last rows of the data frame:
head(cdc)
tail(cdc)
# You can use the functions mean(), var() and median() to calculate the
# (surprise, surprise) mean, variance and median of certain variables of
# your data frame.
# The function summary() returns a numerical summary: minimum, first
# quartile, median, mean, third quartile, and maximum.
mean(cdc$weight)
var(cdc$weight)
median(cdc$weight)
summary(cdc$weight)
# Create the frequency table here (counts per general-health category):
table(cdc$genhlth)
# Create the relative frequency table here. Dividing by nrow(cdc) instead of
# a hard-coded respondent count keeps the proportions right if the data
# frame is subset or replaced.
table(cdc$genhlth) / nrow(cdc)
# Draw the barplot of the counts of each value of 'smoke100':
barplot(table(cdc$smoke100))
# Plot the mosaicplot:
# NOTE(review): 'gender_smokers' is not created anywhere in this excerpt;
# presumably it is a two-way table built from cdc beforehand - confirm it is
# defined before this line runs.
mosaicplot(gender_smokers)
Assign the height of the 1337th respondent to height_1337 using the row-and-column notation.
(Use names() to see what the column index of height is.) Assign the weight of the 111th respondent
to weight_111 using the row-and-column notation.
# Single cells, addressed as [row, column]:
# row 1337 of column 5, and row 111 of column 6.
height_1337 <- cdc[1337, 5]
weight_111 <- cdc[111, 6]
# Show both extracted values:
height_1337
weight_111
# Rectangular slices: rows 1-8 of columns 3-5, and rows 10-20 of columns 6-9.
first8 <- cdc[seq_len(8), seq(3, 5)]
wt_gen_10_20 <- cdc[seq(10, 20), seq(6, 9)]
# Show both slices:
first8
wt_gen_10_20
# Box plot of the respondents' heights:
boxplot(cdc$height)
# Box plots of weight, one per value of 'smoke100':
boxplot(cdc$weight ~ cdc$smoke100)
# Body mass index: weight over squared height, scaled by 703.
# (703 is presumably the conversion factor for pounds and inches - confirm
# the units used in cdc.)
bmi <- cdc$weight / cdc$height^2 * 703
# Box plots of BMI, one per general-health category:
boxplot(bmi ~ cdc$genhlth)
# Histogram of bmi with the default binning:
hist(bmi)
# The same histogram with 'breaks' set to 50:
hist(bmi, breaks = 50)
# ... and with 'breaks' set to 100:
hist(bmi, breaks = 100)
Probability
# Show elements 1 through 9 of the 'basket' variable:
kobe$basket[seq_len(9)]
# Try some simulations! Each call below draws one flip of a fair coin and
# prints it.
outcomes <- c("heads", "tails")
sample(x = outcomes, size = 1, replace = TRUE)
sample(x = outcomes, size = 1, replace = TRUE)
sample(x = outcomes, size = 1, replace = TRUE)
# Run the simulation: 100 flips, both faces equally likely.
outcomes <- c("heads", "tails")
sim_fair_coin <- sample(x = outcomes, size = 100, replace = TRUE)
# Print the simulated flips:
sim_fair_coin
# Count how many heads and how many tails came up:
table(sim_fair_coin)
# Run the simulation: 100 flips of a biased coin where "heads" has
# probability 0.2 and "tails" has probability 0.8.
outcomes <- c("heads", "tails")
sim_unfair_coin <- sample(
  x = outcomes,
  size = 100,
  replace = TRUE,
  prob = c(0.2, 0.8)
)
# Print the simulated flips:
sim_unfair_coin
# Count how many heads and how many tails came up:
table(sim_unfair_coin)
# Run the simulation and assign the result to 'sim_basket': 133 independent
# shots, each a hit ("H") with probability 0.45 or a miss ("M") with
# probability 0.55.
outcomes <- c("H", "M")
sim_basket <- sample(
  x = outcomes,
  size = 133,
  replace = TRUE,
  prob = c(0.45, 0.55)
)
# Print the simulated shots, then the hit/miss counts:
sim_basket
table(sim_basket)
# Calculate streak lengths for the observed shots and for the simulated ones.
# ('calc_streak' is not defined in this excerpt; it must already be loaded.)
kobe_streak <- calc_streak(kobe$basket)
sim_streak <- calc_streak(sim_basket)
kobe_streak
sim_streak
# Compute summaries:
summary(kobe_streak)
summary(sim_streak)
# Make bar plots of how often each streak length occurs. Assignment uses
# '<-' like the rest of these notes.
kobe_table <- table(kobe_streak)
sim_table <- table(sim_streak)
barplot(kobe_table)
barplot(sim_table)
Foundations for Inference: Sampling Distributions
# Draw two independent random samples of 50 values each from 'area':
samp0 <- sample(x = area, size = 50)
samp1 <- sample(x = area, size = 50)
# Histogram of each sample:
hist(samp0)
hist(samp1)
The for loop
# Set up an empty vector of 5000 NAs to store sample means:
sample_means50 <- rep(NA, 5000)
# Take 5000 samples of size 50 of 'area' and store the mean of each one in
# 'sample_means50'.
for (i in 1:5000) {
  samp <- sample(area, 50)
  sample_means50[i] <- mean(samp)
}
# View the result. If you want, you can show more detail by increasing the
# number of bins (which narrows each bin) through the 'breaks' argument.
hist(sample_means50, breaks = 13)
In the case above, we wanted to iterate the two lines of code inside the curly braces that take a
random sample of size 50 from area then save the mean of that sample into
the sample_means50 vector. Without the for loop, this would be painful.
1. In the first line we initialize a vector. In this case, we created a vector of 5000 NAs called
sample_means50. This vector will store values generated within the for
loop. NA means not available, and in this case they're used as placeholders until we fill
in the values with actual sample means. NA is also often used for missing data in R.
2. The second line calls the for loop itself. The syntax can be loosely read as, for every
element i from 1 to 5000, run the following lines of code. You can think of i as the
counter that keeps track of which loop you're on. Therefore, more precisely, the loop will
run once when i=1, then once when i=2, and so on up to i=5000.
3. The body of the for loop is the part inside the curly braces, and this set of code is run for
each value of i. Here, on every loop, we take a random sample of size 50 from area,
take its mean, and store it as the ith element of sample_means50. In order to display
that this is really happening, the version below also asks R to print i at each iteration. This line of code is
optional and is only used for displaying what's going on while the for loop is running.
# Start with 5000 NA placeholders; each one is overwritten inside the loop.
sample_means50 <- rep(NA, 5000)
# Run the body once for every i = 1, 2, ..., 5000.
for (i in seq_len(5000)) {
  # Draw a random sample of 50 values from 'area'
  samp <- sample(area, 50)
  # Save that sample's mean in position i of 'sample_means50'
  sample_means50[i] <- mean(samp)
  # Show the counter so the progress of the loop is visible (optional)
  print(i)
}
# Look at the first few stored means
head(sample_means50)
# Initialize the vector to store the means in (100 NA placeholders):
sample_means_small <- rep(NA, 100)
# Run your for loop: 100 samples of size 50 from 'area', keeping each mean.
# Assignment uses '<-' and the closing brace sits on its own line, matching
# the other loops in these notes.
for (i in 1:100) {
  samp <- sample(area, 50)
  sample_means_small[i] <- mean(samp)
}
# Print the result:
sample_means_small
# One placeholder vector per sample size (10 and 100), 5000 slots each:
sample_means10 <- rep(NA, 5000)
sample_means100 <- rep(NA, 5000)
# On every pass, draw a sample of 10 and then a sample of 100 from 'area',
# storing the mean of each in the matching vector.
for (i in seq_len(5000)) {
  samp <- sample(area, 10)
  sample_means10[i] <- mean(samp)
  samp <- sample(area, 100)
  sample_means100[i] <- mean(samp)
}
# Adjusted R-squared of m5 (m5 is fitted outside this excerpt).
summary(m5)$adj.r.squared
# Each model below leaves one predictor out of the formula and reports the
# adjusted R-squared of the resulting fit on 'evals'.
# Without cls_perc_eval:
m6 <- lm(
  score ~ rank + ethnicity + gender + language + age + cls_students +
    cls_level + cls_profs + cls_credits + bty_avg,
  data = evals
)
summary(m6)$adj.r.squared
# Without cls_students:
m7 <- lm(
  score ~ rank + ethnicity + gender + language + age + cls_perc_eval +
    cls_level + cls_profs + cls_credits + bty_avg,
  data = evals
)
summary(m7)$adj.r.squared
# Without cls_level:
m8 <- lm(
  score ~ rank + ethnicity + gender + language + age + cls_perc_eval +
    cls_students + cls_profs + cls_credits + bty_avg,
  data = evals
)
summary(m8)$adj.r.squared
# Without cls_profs:
m9 <- lm(
  score ~ rank + ethnicity + gender + language + age + cls_perc_eval +
    cls_students + cls_level + cls_credits + bty_avg,
  data = evals
)
summary(m9)$adj.r.squared
# Without cls_credits:
m10 <- lm(
  score ~ rank + ethnicity + gender + language + age + cls_perc_eval +
    cls_students + cls_level + cls_profs + bty_avg,
  data = evals
)
summary(m10)$adj.r.squared
# Without bty_avg:
m11 <- lm(
  score ~ rank + ethnicity + gender + language + age + cls_perc_eval +
    cls_students + cls_level + cls_profs + cls_credits,
  data = evals
)
summary(m11)$adj.r.squared