0% found this document useful (0 votes)
28 views7 pages

List of Functions

The document describes 12 functions for exploratory data analysis including computing correlations, plotting scatterplots and histograms, and checking for normality. It provides the code for functions to compute Pearson correlation using vectors, dataframes, and multiple variables. Additional functions allow plotting correlations, reading CSV files, performing correlation plots, and checking for normality.

Uploaded by

Cyd Duque
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
28 views7 pages

List of Functions

The document describes 12 functions for exploratory data analysis including computing correlations, plotting scatterplots and histograms, and checking for normality. It provides the code for functions to compute Pearson correlation using vectors, dataframes, and multiple variables. Additional functions allow plotting correlations, reading CSV files, performing correlation plots, and checking for normality.

Uploaded by

Cyd Duque
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
You are on page 1/ 7

# --------------------------------------------------------------------

# Developed by Prof. Carlito O. Daarol


# Math Department
# Mindanao State University
# General Santos City
# September 12, 2023

#
# List of functions intended as support for Exploratory Data Analysis
# compute correlation coefficient
# plot scatterplot and graphics
# Check for normality
# --------------------------------------------------------------------

# 1. correlation - function to compute Pearson r correlation using the
#    summation formula, using two vectors or arrays X and Y
#
# 2. Correcorre - function to compute correlation using a dataframe of any
#    size as input
#
# 3. pairwiseCor - function to compute pairwise correlations from two sets of
#    variables
#
# 4. singlesetCor - function to compute pairwise correlation from 1 set of
#    variables
#
# 5. CorrsjPlot - function to compute pairwise correlation using a dataframe
#    and the sjPlot package
#
# 6. DrawCorrelations - function to draw correlation image or heat map using
#    an R package
#
# 7. readcsv - function to read csv file data from any location
#
# 8. CorrePlotXY - function to perform a correlation plot for two variables
#
# 9. CheckforNormality - function to check for normality of any numeric
#    variable using the Shapiro-Wilk test
#
# 10. QQNormality_Plot - function to check and plot normality of distribution
#
# 11. QQPlot - function to check and plot normality using an R package
#
# 12. PlotHistDensity - function to plot histogram and density using a
#     dataframe as input

#------------------------------------------------------------
# Function to compute Pearson Correlation Using the formula
# using two vectors as input data
# -----------------------------------------------------------

correlation <- function(datX, datY) {
  # Pearson correlation of two paired numeric vectors, computed with the
  # raw-score (summation) formula:
  #   r = (n*Sxy - Sx*Sy) / sqrt((n*Sxx - Sx^2) * (n*Syy - Sy^2))
  #
  # datX, datY : numeric (or integer) vectors of the same length.
  # Returns    : a single number. The result is not finite when either
  #              vector is constant (the denominator is 0), and is NA when
  #              any observation is NA (sum() is called without na.rm).
  # Errors     : if the two vectors differ in length.
  if (length(datX) != length(datY)) {
    stop("datX and datY must have the same length", call. = FALSE)
  }

  # Work in double precision: with integer input the element-wise products
  # below overflow to NA once a product exceeds .Machine$integer.max.
  x <- as.numeric(datX)
  y <- as.numeric(datY)
  n <- length(x)

  sum_x <- sum(x)
  sum_y <- sum(y)
  sum_xy <- sum(x * y)
  sum_xx <- sum(x * x)
  sum_yy <- sum(y * y)

  numerator <- n * sum_xy - sum_x * sum_y
  denominator <- (n * sum_xx - sum_x * sum_x) * (n * sum_yy - sum_y * sum_y)

  numerator / sqrt(denominator)
}

# -----------------------------------------------------------
# Function to compute Pearson Correlation Using a dataframe
# the dataframe may consist of several columns
# -----------------------------------------------------------

Correcorre <- function(data) {
  # Pearson correlation for every pair of columns in `data`, computed with
  # correlation() (defined elsewhere in this file).
  #
  # data    : a data frame (or matrix) of numeric columns.
  # Returns : a numeric vector with one entry per column pair, ordered
  #           (1,2), (1,3), ..., (1,p), (2,3), ..., (p-1,p).
  #           An empty numeric vector when there are fewer than two columns.
  n_col <- ncol(data)

  # 1:(n_col - 1) would count backwards for a single column, so bail out early.
  if (is.null(n_col) || n_col < 2) {
    return(numeric(0))
  }

  # Preallocate: there are p * (p - 1) / 2 unordered column pairs.
  result <- numeric(n_col * (n_col - 1) / 2)
  counter <- 0

  for (i in seq_len(n_col - 1)) {
    for (j in (i + 1):n_col) {
      counter <- counter + 1
      result[counter] <- correlation(data[, i], data[, j])
    }
  }

  result
}

# --------------------------------------------------------------------------
# Function to compute Pearson Correlation using two sets of variables
# -------------------------------------------------------------------------

pairwiseCor <- function(depvar, indepvar, corrtype) {
  # Correlate every column of `depvar` with every column of `indepvar`.
  #
  # depvar, indepvar : data frames (named lists of equal-length numeric
  #                    columns); the argument names are just placeholders.
  # corrtype         : method handed to cor.test(): "pearson", "kendall" or
  #                    "spearman".
  #
  # Returns a data frame with one row per (depvar column, indepvar column)
  # pair and the columns
  #   VariableX, VariableY : names of the two columns
  #   AbsCor, Cor          : |estimate| and estimate, rounded to 2 decimals
  #   PValue               : p-value of cor.test(), rounded to 2 decimals
  #   Assessment (Pvalue <0.05) : verdict based on the unrounded p-value
  n_dep <- length(depvar)
  n_indep <- length(indepvar)
  n_pairs <- n_dep * n_indep

  # Typed holders for the per-pair results.
  var_x <- character(n_pairs)
  var_y <- character(n_pairs)
  estimate <- numeric(n_pairs)
  p_value <- numeric(n_pairs)

  k <- 0
  for (i in seq_len(n_dep)) {
    for (j in seq_len(n_indep)) {
      k <- k + 1
      var_x[k] <- names(depvar)[i]
      var_y[k] <- names(indepvar)[j]

      # Take the coefficient from the same test object as the p-value so both
      # use `corrtype` (a separate cor() call would always be Pearson).
      tt <- cor.test(depvar[[i]], indepvar[[j]], method = corrtype)
      estimate[k] <- unname(tt$estimate)
      p_value[k] <- tt$p.value
    }
  }

  msg1 <- "Reject Ho. Linear Relationship is true"
  msg2 <- "Not correlated variables"

  # Decide on the raw p-value: rounding first would turn e.g. 0.046 into 0.05
  # and misreport a significant result as not significant.
  assessment <- as.character(ifelse(p_value < 0.05, msg1, msg2))

  df <- data.frame(
    VarX = var_x,
    VarY = var_y,
    AbsCor = round(abs(estimate), 2),
    Cor = round(estimate, 2),
    PValue = round(p_value, 2),
    Assessment = assessment,
    stringsAsFactors = FALSE
  )
  colnames(df) <- c(
    "VariableX", "VariableY", "AbsCor", "Cor", "PValue",
    "Assessment (Pvalue <0.05)"
  )

  df
}

# -----------------------------------------------------------------------
# Function to compute pairwise correlations using a single set of variables
# -----------------------------------------------------------------------

# function to compute pairwise correlation from a single set


singlesetCor <- function(singleset, corrtype) {
  # Correlate every unordered pair of columns within one data frame.
  #
  # singleset : data frame of numeric columns.
  # corrtype  : method handed to cor.test(): "pearson", "kendall" or
  #             "spearman".
  #
  # Returns a data frame with one row per column pair (i < j) and the columns
  #   VarX, VarY  : names of the two columns
  #   AbsCor, Cor : |estimate| and estimate, rounded to 2 decimals
  #   PValue      : p-value of cor.test(), rounded to 2 decimals
  #   Assessment  : "<0.05 (sig)" or "> 0.05 (not sig)", decided on the
  #                 unrounded p-value
  # A data frame with zero rows is returned when there are fewer than two
  # columns.
  numcol <- ncol(singleset)
  n_pairs <- if (numcol < 2) 0 else numcol * (numcol - 1) / 2

  # Typed holders for the per-pair results.
  var_x <- character(n_pairs)
  var_y <- character(n_pairs)
  estimate <- numeric(n_pairs)
  p_value <- numeric(n_pairs)

  k <- 0
  # seq_len() yields an empty loop for a single column, where 1:(numcol - 1)
  # would iterate over c(1, 0).
  for (i in seq_len(max(numcol - 1, 0))) {
    for (j in (i + 1):numcol) {
      k <- k + 1
      var_x[k] <- names(singleset)[i]
      var_y[k] <- names(singleset)[j]

      # Coefficient and p-value come from the same test so both honour
      # `corrtype` (a separate cor() call would always be Pearson).
      tt <- cor.test(singleset[[i]], singleset[[j]], method = corrtype)
      estimate[k] <- unname(tt$estimate)
      p_value[k] <- tt$p.value
    }
  }

  # Use the raw p-value: rounding first would turn e.g. 0.046 into 0.05 and
  # label a significant coefficient as not significant.
  assessment <- as.character(
    ifelse(p_value < 0.05, "<0.05 (sig)", "> 0.05 (not sig)")
  )

  data.frame(
    VarX = var_x,
    VarY = var_y,
    AbsCor = round(abs(estimate), 2),
    Cor = round(estimate, 2),
    PValue = round(p_value, 2),
    Assessment = assessment,
    stringsAsFactors = FALSE
  )
}

# -----------------------------------------------------------
# function to compute pairwise correlation using R packages
# -----------------------------------------------------------

CorrsjPlot <- function(data, corrtype, mtitle) {
  # Build a correlation table for `data` with sjPlot::tab_corr().
  #
  # data     : passed through unchanged as tab_corr()'s first argument.
  # corrtype : forwarded as `corr.method`.
  # mtitle   : forwarded as `title`.
  #
  # Returns whatever tab_corr() returns. All remaining tab_corr() options are
  # fixed below; see the sjPlot documentation for their meaning.
  library(sjPlot)

  tab_corr(
    data,
    # caller-supplied settings
    title = mtitle,
    corr.method = corrtype,
    # computation
    na.deletion = "pairwise",
    val.rm = NULL,
    # table layout
    triangle = "lower",
    digits = 3,
    show.p = TRUE,
    p.numeric = FALSE,
    fade.ns = TRUE,
    var.labels = NULL,
    wrap.labels = 40,
    string.diag = NULL,
    # output handling
    CSS = NULL,
    encoding = NULL,
    file = NULL,
    use.viewer = TRUE,
    remove.spaces = TRUE
  )
}

# ----------------------------------------------------
# function to draw Correlations using R packages
# ---------------------------------------------------
DrawCorrelations <- function(cordata) {
  # Illustrate a set of correlation values with simulated scatterplots.
  #
  # cordata : numeric vector of target correlation coefficients.
  #
  # For each value, 100 points are drawn from a bivariate normal distribution
  # with that correlation (mvtnorm::rmvnorm), plotted in a 2 x 3 panel grid
  # with the sample Pearson r in the title and the fitted regression line in
  # red. Called for its plotting side effect; returns NULL invisibly.
  # The caller's graphical parameters are restored on exit.

  # library() stops with an error if mvtnorm is missing; require() would only
  # return FALSE and let the function fail later at rmvnorm().
  library(mvtnorm)

  old_par <- par(
    mfrow = c(2, 3),
    mar = 0.1 + c(4, 4, 1, 1),
    oma = c(0, 0, 2, 0)
  )
  # Restore mfrow, mar and oma even if plotting fails part-way through.
  on.exit(par(old_par), add = TRUE)

  for (i in seq_along(cordata)) {
    # 2 x 2 correlation matrix with unit variances.
    S <- matrix(c(1, cordata[i], cordata[i], 1), 2, 2)
    AB <- rmvnorm(n = 100, mean = c(0, 0), sigma = S)

    # Standard-normal margins -> uniform margins.
    U <- pnorm(AB)

    # Map the uniforms back to normal margins with mean 1 and sd 0.02.
    x <- qnorm(U[, 1], 1, .02)
    y <- qnorm(U[, 2], 1, .02)

    corval <- cor(x, y)
    plot(x, y, main = paste0(" Pearson r = ", round(corval, 3)))
    fit <- lm(y ~ x)
    abline(fit, col = "red")
  }

  invisible(NULL)
}

# Read csv data from any location

readcsv <- function(datapath, dataname) {
  # Read a CSV file given its directory and file name.
  #
  # datapath : directory containing the file (without a trailing "/").
  # dataname : file name, including the extension.
  #
  # Returns the data frame produced by read.csv() with its default settings.
  csv_file <- paste0(datapath, "/", dataname)
  read.csv(csv_file)
}

## Perform correlation plot for two variables


CorrePlotXY <- function(data, X, Y, color, Xlab, Ylab, corrtype) {
  # Scatterplot of two variables with a regression line and the correlation
  # coefficient, drawn with ggpubr::ggscatter().
  #
  # data       : passed through as ggscatter()'s first argument.
  # X, Y       : forwarded as `x` and `y` (presumably column names in `data`;
  #              confirm against the ggpubr documentation).
  # color      : forwarded as `color`.
  # Xlab, Ylab : axis labels.
  # corrtype   : forwarded as `cor.method`.
  #
  # Returns whatever ggscatter() returns.
  library(ggpubr)

  ggscatter(
    data,
    x = X,
    y = Y,
    xlab = Xlab,
    ylab = Ylab,
    color = color,
    add = "reg.line",
    cor.coef = TRUE,
    cor.method = corrtype
  )
}

CheckforNormality <- function(dat) {
  # Shapiro-Wilk normality test for each group of a long-format data frame.
  #
  # dat : data frame with a grouping column named `variable` and a numeric
  #       column named `Score` (both names are hard-coded below).
  #
  # Returns a data frame with one row per group and the columns
  #   variable                         : the group
  #   W Statistic                      : test statistic, rounded to 3 decimals
  #   p-value                          : p-value, rounded to 3 decimals
  #   Variable Distribution Assessment : text verdict at the 0.05 level
  library(dplyr)

  df <- dat %>%
    group_by(variable) %>%
    summarise(`W Statistic` = shapiro.test(Score)$statistic,
              `p-value` = shapiro.test(Score)$p.value)
  df <- as.data.frame(df)

  # Keep the unrounded p-values for the decision: judging the rounded value
  # would turn e.g. 0.046 into 0.05 and report a non-normal variable as normal.
  raw_p <- df[[3]]

  df[[2]] <- round(df[[2]], 3)
  df[[3]] <- round(raw_p, 3)

  verdict <- ifelse(
    raw_p < 0.05,
    " is not normal(pval<.05)",
    " distribution is normal"
  )
  df$`Variable Distribution Assessment` <- paste0(as.character(df[[1]]), verdict)

  df
}

# Function call: Normality Test


QQNormality_Plot <- function(data_long) {
  # Build faceted normal Q-Q plots, one panel per variable, each with a
  # bootstrap confidence band (B = 5000 resamples, 95% level).
  #
  # data_long : two-column long-format data frame; its columns are renamed to
  #             `variable` (group) and `value` (observation), in that order.
  #
  # Returns the ggplot object (it is drawn when printed).
  # NOTE(review): ggplot2 must already be attached, and stat_qq_band(),
  # stat_qq_line() and stat_qq_point() are not part of ggplot2 - presumably
  # they come from the qqplotr package; confirm and attach it in the caller.
  colnames(data_long) <- c("variable", "value")

  ggplot(
    data = data_long,
    mapping = aes(sample = value, color = variable, fill = variable)
  ) +
    stat_qq_band(
      alpha = 0.5, conf = 0.95, qtype = 1, bandType = "boot", B = 5000
    ) +
    stat_qq_line(identity = TRUE) +
    stat_qq_point(col = "black") +
    # Title kept as one literal so only the explicit "\n" breaks the line.
    ggtitle(paste0(
      "QQ Plot for Normality of Distribution \n",
      " Points must lie inside the confidence band"
    )) +
    facet_wrap(~ variable, scales = "free") +
    labs(x = "Theoretical Quantiles", y = "Sample Quantiles") +
    theme_bw()
}

# Function call: Normality Test


QQPlot <- function(data_long, Toword, fileout) {
  # Draw faceted normal Q-Q plots, one panel per variable, each with a
  # bootstrap confidence band (B = 500 resamples, 95% level).
  #
  # data_long : long-format data frame with columns `variable` (group) and
  #             `value` (observation).
  # Toword, fileout : accepted but not used by this function.
  #
  # Called for its plotting side effect; returns NULL invisibly.
  # NOTE(review): stat_qq_band(), stat_qq_line() and stat_qq_point() are not
  # part of ggplot2 - presumably they come from the qqplotr package; confirm
  # and attach it in the caller.
  library(ggplot2)

  qq_plot <- ggplot(
    data = data_long,
    mapping = aes(sample = value, color = variable, fill = variable)
  ) +
    stat_qq_band(
      alpha = 0.5, conf = 0.95, qtype = 1, bandType = "boot", B = 500
    ) +
    stat_qq_line(identity = TRUE) +
    stat_qq_point(col = "black") +
    # Title kept as one literal so only the explicit "\n" breaks the line.
    ggtitle(paste0(
      "Quantile-Quantile Test for Normality \n",
      " Normally distributed if points are inside the confidence band"
    )) +
    facet_wrap(~ variable, scales = "free") +
    labs(x = "Theoretical Quantiles", y = "Sample Quantiles") +
    theme_bw()

  # A ggplot object is only rendered when printed; inside a function its value
  # is otherwise discarded and nothing would be drawn.
  print(qq_plot)

  invisible(NULL)
}
PlotHistDensity <- function(vars, data_wide) {
  # For each of the first (up to) three columns of `data_wide`, draw an index
  # plot of the values and a histogram overlaid with a fitted normal curve,
  # on a 3 x 2 panel grid.
  #
  # vars      : accepted but not used by this function.
  # data_wide : data frame (or matrix) with numeric columns; NAs are dropped
  #             per column.
  #
  # Called for its plotting side effect; returns NULL invisibly. The caller's
  # panel layout is restored on exit.
  old_par <- par(mfrow = c(3, 2))
  on.exit(par(old_par), add = TRUE)

  # The grid has room for three variables; do not index past the last column.
  n_panels <- min(3, ncol(data_wide))

  for (i in seq_len(n_panels)) {
    dat <- na.omit(data_wide[, i])
    plot(dat)

    h <- hist(dat, main = colnames(data_wide)[i])

    # Scale the density to expected counts per bin using the histogram's
    # actual bin width, so the curve lines up with the bars.
    bin_width <- diff(h$breaks[1:2])
    x <- seq(from = min(dat), to = max(dat), length.out = 50)
    norm_dist <- dnorm(x, mean = mean(dat), sd = sd(dat)) *
      bin_width * length(dat)

    lines(x, norm_dist, col = "violet", lwd = 4)
  }

  invisible(NULL)
}

You might also like