0% found this document useful (0 votes)
18 views5 pages

CourseKata R Cheatsheet ABC

This cheat sheet provides a comprehensive overview of statistical methods and data science techniques, including word equations, summary tables, simple statistics, probability distributions, simulations, model fitting, and visualizations. It includes R code snippets for various operations such as computing means, creating frequency tables, performing t-tests, and generating plots. The document serves as a quick reference for statistical analysis and data visualization in R.

Uploaded by

issy
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
18 views5 pages

CourseKata R Cheatsheet ABC

This cheat sheet provides a comprehensive overview of statistical methods and data science techniques, including word equations, summary tables, simple statistics, probability distributions, simulations, model fitting, and visualizations. It includes R code snippets for various operations such as computing means, creating frequency tables, performing t-tests, and generating plots. The document serves as a quick reference for statistical analysis and data visualization in R.

Uploaded by

issy
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 5

Statistics and Data Science I (ABC) CHEAT SHEET

https://bit.ly/r-cheatsheet-abc
Word Equations Summary Tables Simple Statistics
outcome = explanatory + other stuff # compute five-number summary mean(data_set$Y)
Y = X + other stuff favstats(~ Y, data = data_set) var(data_set$Y)
sd(data_set$Y)
# create frequency table
Basics tally(data_set$Y) cohensD(Y ~ X, data = data_set)
tally(~ Y, data = data_set) cor(Y ~ X, data = data_set)
print("Hello world!")
# tally by condition b1(Y ~ X, data = data_set)
# assign value to object b1(one_model)
tally(~ Y < 1900, data =
myNumber <- 5
data_set)
pre(Y ~ X, data = data_set)
# combine values into vector f(Y ~ X, data = data_set)
# two-way frequency table
myVector <- c(1, 2, 3)
tally(Y ~ X, data = data_set, margin =
TRUE, format = "proportion")
# first element in vector
myVector[1]
Data Frame # arrange rows by variable
arrange(data_set, Y)
# orders values or cases # structure of data frame # arrange rows by variable in descending order
sort(myVector) str(data_set) arrange(data_set, desc(Y))
# arithmetic operations
# view first/last six rows # creates data frame from csv file
sum(1, 2, 100), +, -, *, /
head(data_set) data_set <- read.csv("file_name", header =
sqrt(157)
abs(data_set$Y) tail(data_set) TRUE)

# logical operations # select multiple variables # convert quantitative variable


>, <, >=, <=, ==, !=, |, & select(data_set, Y1, Y2) # to categorical
factor(data_set$Y)
# results in a variable with values # first six rows of selected variables factor(data_set$Y, levels = c(1,2), labels
# of TRUE or FALSE head(select(data_set, Y1, Y2)) = c("A", "B"))
data_set$C <- data_set$A > data_set$B
# transform values
# select variable (a column) recode(data_set$Y, "0" = 0, "1" = 50, "2" =
data_set$Y 100)

# find rows that meet condition # creates two equal sized groups
data_set[data_set$Y > 40, ] ntile(data_set$Y, 2)
filter(data_set, Y > 300)
filter(data_set, Y != "NA") # convert categorical variable
# to quantitative
as.numeric(data_set$Y)
Probability Distribution
# CI using t distribution
# calculate the probability area confint(empty_model)
xpnorm(65.1, data_set$mean, data_set$sd)

# calculate a z-score # calculate p-value using F-distribution


zscore(data_set$Y) xpf(sample_F, df1 = 2 , df2 = 10)

# returns t at this probability


qt(.975, df = 999)
# returns F at this probability
qf(.95, df1 = 1, df2 = 100)

Page: 2 ▷ Updated: 2023-04 ▷ Learn more about CourseKata @ https://coursekata.org


Simulation Fitting and Evaluating Models
# randomize sampling distribution
# sample without replacement # of b1s, centered on 0 # empty model
sample(data_set, 6) sdob1 <- do(1000) * empty_model <- lm(Y ~ NULL,
b1(shuffle(Y) ~ X, data = data_set) data = data_set)
# sample with replacement
resample(data_set, 10) # bootstrap sampling distribution of # use one explanatory variable
b1s, one_model <- lm(Y ~ X, data = data_set)
do(3) * resample (data_set, 10) # centered on sample b1
sdob1_boot <- do(1000) * # create a function from a formula
# mixes up values in a variable b1(Y ~ X, data = resample(data_set)) one_model_fun <- makeFun(one_model)
shuffle(data_set$Y)
one_model_fun(x_level_1)
# count the number of b1s at the upper
# simulate sampling 10000 Ys # and lower extreme
# model predictions and residuals
# from normal distribution tally(sdob1$b1 > sample_b1 | data_set$empty_predict <-
sim_Y <- rnorm(10000, Y_stats$mean, sdob1$b1 < -sample_b1) predict(empty_model)
Y_stats$sd) data_set$empty_resid <-
resid(empty_model)
# return TRUE for middle 95% of
# put simulated Ys into dataframe
data_set<- data.frame(sim_Y) distribution
# produce ANOVA table
middle(sdob1$b1, .95)
anova(empty_model)
# simulate sampling distribution of supernova(one_model)
means # randomize sampling distribution of
sim_SDoM <- do(10000) * mean(rnorm(157, PREs # t-test, using pooled variance
Y_stats$mean, Y_stats$sd)) sdoPRE <- do(1000) * PRE(shuffle(Y) ~ X,
data = data_set) t.test(Tip ~ Condition, data =
data_set, var.equal=TRUE)
# bootstrap sampling distribution of
# randomize sampling distribution of
means # pairwise comparison
bootSDoM <- do(10000) * Fs
sdoF <- do(1000) * # corrections: "Bonferroni",
mean(resample(data_set$Y, 157)) "Tukey" (default) or "none"
fVal(shuffle(Y) ~ X, data = data_set)
pairwise(one_model, correction =
# counts extreme Fs "none")
tally(~fVal > sample_F, data = sdoF)

Page: 3 ▷ Updated: 2023-04 ▷ Learn more about CourseKata @ https://coursekata.org


Visualizations I
# faceted grid of histograms # density histogram
# histogram gf_histogram(~ Y, data = data_set) %>% gf_dhistogram(~ Y, data = data_set, fill
gf_histogram(~ Y, data = data_set) %>% gf_facet_grid(X ~ .) = "orange") %>%
# change labels gf_density()
gf_labs(title = "Graph Title", x
= "Y_Name", y = "Frequency")

# bar graph # boxplots # point plot


gf_bar( ~ Y, data = data_set) gf_boxplot(Y ~ X, data = data_set) gf_point(Y ~ X, data = data_set)

Page: 4 ▷ Updated: 2023-04 ▷ Learn more about CourseKata @ https://coursekata.org


Visualizations II
# boxplot overlaid with jitter plot # sampling distribution of b1
# jitter plot gf_boxplot(Y ~ X, data = data_set, fill gf_histogram(~b1, data = sdob1, fill =
gf_jitter(Y ~ X, data = data_set) = "orange") %>% ~middle(b1, .95)) %>%
gf_jitter(height = 0, alpha = .2, size # modify the limits on x- and y-axes
= 3) gf_lims(x = c(-12, 12), y = c(0, 70))

# Add model # CI for pairwise comparisons


gf_point(Y ~ X, data = data_set) %>% pairwise(one_model, plot = TRUE)
# add model predictions as red points
gf_point(Y ~ X , shape = 1, size = 3,
color = "firebrick") %>%
# add best fitting model as a red line
gf_model(one_model, color = "red")

Page: 5 ▷ Updated: 2023-04 ▷ Learn more about CourseKata @ https://coursekata.org

You might also like