0% found this document useful (0 votes)
62 views7 pages

Credit Modelling in R

Uploaded by

Seshendra Vemuri
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
62 views7 pages

Credit Modelling in R

Uploaded by

Seshendra Vemuri
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as DOCX, PDF, TXT or read online on Scribd
You are on page 1/ 7

Credit Modelling in R

Now let us start using R for Credit Modelling:


The first thing we need to do is to load the R packages into the library:
# Load R packages into the library

# Data management packages

library(DescTools)

library(skimr)

library(plyr)

library(dplyr)

library(aod)

library(readxl)

# Visualization packages

library(Deducer)

library(ggplot2)

# Machine learnning method packages

library(ROCR)

library(pROC)

library(caret)

library(MASS)

Now it is time to load the dataset and do some data management. We will work with
the loan lending club dataset. The below coding is the data management:
# Import dataset

loan_data <- read.csv("/loan.csv")

# Selecting the relevant variables in the dataset:

loan_data <-
loan_data[,c("grade","sub_grade","term","loan_amnt","issue_d","loan_sta
tus","emp_length",

"home_ownership",
"annual_inc","verification_status","purpose","dti",

"delinq_2yrs","addr_state","int_rate",
"inq_last_6mths","mths_since_last_delinq",
"mths_since_last_record","open_acc","pub_rec"
,"revol_bal","revol_util","total_acc")]

# Data management for missing observations

loan_data$mths_since_last_delinq[is.na(loan_data$mths_since_last_delinq
)] <- 0

loan_data$mths_since_last_record[is.na(loan_data$mths_since_last_record
)] <- 0

var.has.na <- lapply(loan_data, function(x){any(is.na(x))})

num_na <- which( var.has.na == TRUE )

per_na <- num_na/dim(loan_data)[1]

loan_data <- loan_data[complete.cases(loan_data),]

Although this is the second step of a credit modeling analysis, the visualization step
can be found in my previous article, let us do minimum of visualization in case the
reader only reads this article:
# Visualization of the data

# Bar chart of the loan amount

loanamount_barchart <- ggplot(data=loan_data, aes(loan_data$loan_amnt))


+

geom_histogram(breaks=seq(0, 35000, by=1000),

col="black", aes(fill=..count..)) +

scale_fill_gradient("Count", low="green1", high="yellowgreen")+

labs(title="Loan Amount", x="Amount", y="Number of Loans")

loanamount_barchart

ggplotly(p = ggplot2::last_plot())

# Box plot of loan amount

box_plot_stat <- ggplot(loan_data, aes(loan_status, loan_amnt))

box_plot_stat + geom_boxplot(aes(fill = loan_status)) +

theme(axis.text.x = element_blank()) +

labs(list(title = "Loan amount by status", x = "Loan Status", y =


"Amount"))

ggplotly(p = ggplot2::last_plot())
The above coding gives us the following two visualizations:

Lets see some descriptive statistics of the data as well:


skim(loan_data)

Skim summary statistics

n obs: 886877
n variables: 23

-- Variable type:factor
--------------------------------------------------------

variable missing complete n n_unique


top_counts ordered

addr_state 0 886877 886877 51 CA: 129456, NY:


74033, TX: 71100, FL: 60901 FALSE

emp_length 0 886877 886877 12 10+: 291417, 2 y:


78831, < 1: 70538, 3 y: 69991 FALSE

grade 0 886877 886877 7 B: 254445, C:


245721, A: 148162, D: 139414 FALSE

home_ownership 0 886877 886877 6 MOR: 443319, REN:


355921, OWN: 87408, OTH: 180 FALSE

issue_d 0 886877 886877 103 Oct: 48619, Jul:


45938, Dec: 44323, Oct: 38760 FALSE

loan_status 0 886877 886877 8 Cur: 601533, Ful:


209525, Cha: 45956, Lat: 11582 FALSE

purpose 0 886877 886877 14 deb: 524009, cre:


206136, hom: 51760, oth: 42798 FALSE

sub_grade 0 886877 886877 35 B3: 56301, B4:


55599, C1: 53365, C2: 52206 FALSE

term 0 886877 886877 2


36: 620739, 60: 266138, NA: 0 FALSE

verification_status 0 886877 886877 3 Sou: 329393,


Ver: 290896, Not: 266588, NA: 0 FALSE

-- Variable type:numeric
-------------------------------------------------------

variable missing complete n mean sd p0


p25 p50 p75 p100 hist

annual_inc 0 886877 886877 75019.4 64687.38 0


45000 65000 90000 9500000 ????????

delinq_2yrs 0 886877 886877 0.31 0.86 0


0 0 0 39 ????????

dti 0 886877 886877 18.16 17.19 0


11.91 17.66 23.95 9999 ????????
inq_last_6mths 0 886877 886877 0.69 1 0
0 0 1 33 ????????

int_rate 0 886877 886877 13.25 4.38 5.32


9.99 12.99 16.2 28.99 ????????

loan_amnt 0 886877 886877 14756.97 8434.43 500


8000 13000 20000 35000 ????????

mths_since_last_delinq 0 886877 886877 16.62 22.89 0


0 0 30 188 ????????

mths_since_last_record 0 886877 886877 10.83 27.65 0


0 0 0 129 ????????

open_acc 0 886877 886877 11.55 5.32 1


8 11 14 90 ????????

pub_rec 0 886877 886877 0.2 0.58 0


0 0 0 86 ????????

revol_bal 0 886877 886877 16924.56 22414.33 0


6450 11879 20833 2904836 ????????

revol_util 0 886877 886877 55.07 23.83 0


37.7 56 73.6 892.3 ????????

total_acc 0 886877 886877 25.27 11.84 1


17 24 32 169 ????????
Next we need to do some more data management to prepare the dataset for machine
learning analysis
# Focus on the historical loans

loan_data=as.data.frame(loan_data[loan_data$loan_status!="Current", ])

limits_inc = quantile(loan_data$annual_inc, seq(0,1,0.1))

labels <- c(0, limits_inc[2:10], "+inf")

labels <- prettyNum(labels, big.mark = ",")

labels <- paste(labels[1:10], labels[2:11], sep = "-")

loan_data$annual_inc <- cut(loan_data$annual_inc, limits_inc, labels =


labels, include.lowest = T)

loan_data[,"annual_inc"] <- as.character(loan_data[,"annual_inc"])

# Create binary variables for the logistic regression analysis

# Annual_inc

loan_data$annual_inc[loan_data$annual_inc == "70,000- 80,000"|


loan_data$annual_inc == "80,000- 94,000" | loan_data$annual_inc ==
"94,000-120,000" | loan_data$annual_inc == "120,000- +inf" ] <- 1
loan_data$annual_inc[loan_data$annual_inc != 1] <- 0

loan_data$annual_inc <- as.numeric(loan_data$annual_inc)

# Home_ownership

loan_data$home_ownership <- as.character(loan_data$home_ownership)

loan_data$home_ownership[loan_data$home_ownership=="OWN" |
loan_data$home_ownership=="MORTGAGE" ] <- 1

loan_data$home_ownership[loan_data$home_ownership!=1] <- 0

# Dealinq_2yrs

loan_data$delinq_2yrs <- as.character(loan_data$delinq_2yrs)

loan_data$delinq_2yrs[loan_data$delinq_2yrs=="0"] <- 0

loan_data$delinq_2yrs[loan_data$delinq_2yrs!= 0] <- 1

# Verification status: if Verified = 1 ; otherwise = 0

loan_data$verification_status =
as.character(loan_data$verification_status)

loan_data$verification_status[loan_data$verification_status ==
"Verified" | loan_data$verification_status == "Source Verified"] = 1

loan_data$verification_status[loan_data$verification_status != 1] = 0

loan_data$verification_status=as.numeric(loan_data$verification_status)

# Dti

dti_quant <- quantile(loan_data$dti, seq(0, 1, 0.1))

labels = c(0,prettyNum(dti_quant[2:10], big.mark = ","), "+Inf")

labels = paste(labels[1:10],labels[2:11], sep = "-")

loan_data <- mutate(loan_data, dti= cut(loan_data$dti, breaks =


dti_quant, labels = factor(labels), include.lowest = T))

loan_data$dti <- as.character(loan_data$dti)

loan_data$dti[loan_data$dti == "0-6.57" | loan_data$dti == "12.13-


14.32" | loan_data$dti == "14.32-16.49" ] <- 1

loan_data$dti[loan_data$dti!=1] <- 0

# Status

loan_data$loan_status <- as.character(loan_data$loan_status)

loan_data$loan_status[loan_data$loan_status == "Charged Off" |


loan_data$loan_status == "Default" ] <- 1

loan_data$loan_status[loan_data$loan_status != 1] <- 0
table(loan_data$loan_status)

PercTable(loan_data$loan_status)

# Change to nummeric variables:

loan_data[,"revol_util"] <- as.numeric(sub("%",


"",loan_data$"revol_util", fixed =TRUE))/100

loan_data[,"int_rate"] <- as.numeric(sub("%", "",loan_data$"int_rate",


fixed =TRUE))/100

loan_data$loan_status <- as.numeric(loan_data$loan_status)

# Grouping variables

loan_data$purpose <- as.character(loan_data$purpose)

loan_data$purpose[loan_data$purpose == "car" | loan_data$purpose ==


"major_purchase" |

loan_data$purpose == "home_improvement"|
loan_data$purpose == "credit_card" ] <- 2

loan_data$purpose[loan_data$purpose == "moving" | loan_data$purpose ==


"small_business" |

loan_data$purpose == "renewable_energy" ] <- 0

loan_data$purpose[loan_data$purpose!= 0 & loan_data$purpose!= 2 ] <- 1

loan_data$purpose <- as.factor(loan_data$purpose)

Now it is time to make the machine learning regression analysis. We will work with
multiple logistic regression. Logistic regression is applied when you have a binary
variable (y) to explain. The logistic regression model uses the cumulative distribution
function to estimate the logistic function of the model with a group of explanatory
variables (the x’s). We will work with a stepwise model in order to find a final model
for the logistic regression. The below coding generates the multiple logistic regression
analysis:

You might also like