# R Script Module 3
# R Script Module 3
# #
# #
############################################################
############################################################
############################################################
############################################################
############################################################
# Set your working directory to the folder where you have downloaded the
# Credit Scoring dataset.

# To clean up the memory of your current R session, run the following line.
# `all.names` is spelled out in full rather than relying on partial argument
# matching of `all`.
rm(list = ls(all.names = TRUE))

# NOTE(review): no statement that loads the dataset into `data` is visible
# between the clean-up above and the calls below -- confirm that `data` is
# assigned before they run.

# The str() function shows the structure of your dataset and details the type
# of variables that it contains.
str(data)

# The summary() function provides, for each variable in your dataset, the
# minimum, mean, maximum and quartiles.
summary(data)
############################################################
############################################################
# Set your working directory to the folder where you have downloaded the
# HR Analytics 2 dataset.

# To clean up the memory of your current R session, run the following line.
# `all.names` is spelled out in full rather than relying on partial argument
# matching of `all`.
rm(list = ls(all.names = TRUE))

# Let's load our dataset and call it datatot.
# NOTE(review): the statement that actually creates `datatot` is not visible
# at this point -- confirm it runs before the call below.

# Now let's have a look at our variables and see some summary statistics.
# The str() function shows the structure of your dataset and details the type
# of variables that it contains.
str(datatot)
# Compute the proportion of correctly classified employees who stayed:
# rows with left == 0 whose fitted value is at or below the cutoff, divided
# by the number of rows with left == 0.
sum((logreg$fitted.values <= cutoff) & (datatot$left == 0)) /
  sum(datatot$left == 0)

# Compute the proportion of correctly classified employees who left:
# rows with left == 1 whose fitted value is above the cutoff, divided by the
# number of rows with left == 1.
sum((logreg$fitted.values > cutoff) & (datatot$left == 1)) /
  sum(datatot$left == 1)
# Let's use a more visual way to see the effect of one of the most important
# drivers: TIC.
# An aggregated plot.
tempdata <- datatot

# Bubble chart with one circle per element of the aggregated vectors: x is
# aggbTimeRank$TIC, y is aggbTimeRank$left, and the circle sizes come from
# cntbTimeRank$left.
# NOTE(review): aggbTimeRank and cntbTimeRank are not created in the visible
# part of the script -- confirm they exist before this call.
# The y-axis label is kept on a single line so that it does not contain a
# stray line break.
symbols(
  aggbTimeRank$TIC, aggbTimeRank$left,
  circles = cntbTimeRank$left,
  inches = .75, fg = "white", bg = "red",
  main = "Time and Employee Attrition",
  ylab = "Average Attrition Rate",
  xlab = "Time spent"
)
# Let's use a more visual way to see the effect of the most important driver:
# Satisfaction.
tempdata <- datatot

# Bubble chart with one circle per element of the aggregated vectors: x is
# aggbSatisRank$rankSatis, y is aggbSatisRank$left, and the circle sizes come
# from cntbSatisRank$left.
# NOTE(review): aggbSatisRank and cntbSatisRank are not created in the visible
# part of the script -- confirm they exist before this call.
symbols(
  aggbSatisRank$rankSatis, aggbSatisRank$left,
  circles = cntbSatisRank$left,
  inches = .2, fg = "white", bg = "red",
  main = "Satisfaction and Employee Attrition",
  ylab = "Average Attrition Rate",
  xlab = "Rank of Satisfaction"
)
## The bubble charts in the last 2 examples can be made more rigorous by
## adjusting how the circles are sized.
#######
## Addendum A ##
#######
## Humans perceive the area of shapes like circles. So if some value is twice
## as large, we want the area of the circle to be twice as large, not the
## radius. The symbols() function takes the radius of the circles by default,
## so we need to compute the radius in order to end up with the desired size
## of circles.
size <- cntbTimeRank$left
#######
## Addendum B ##
#######
bins <- 20
## The first bin, 0-0.05, contains no employees, so its circle should have
## size 0.
## As before, the circles are to be sized by their area rather than by their
## radius.