Credit Risk: Predictive machine learning model for loan default, with R code

This case study walks through the steps to build a loan default model using machine learning, with R code.

Download the dataset

Download the dataset by clicking on dataset no. 4, entitled “Credit Risk: Predicting loan default”.

  1. Create a new project in R. If you are new to R programming, follow the steps in the link below to create a project in R:

http://datasciencevidhya.com/post/how-to-create-a-project-in-r-studio

2. Set up the working directory for your project

            setwd("     ")   # put the path to your project folder between the quotes

The data is in Excel format.

To read Excel files in R you will need the readxl package. Install it once with install.packages(), then load it with library().

install.packages("readxl")

library("readxl")

dataDT <- read_excel("dataDT.xls")

dataset <- dataDT

Data Exploration

head(dataset)

  branch ncust customer   age ed    employ address income
  <fct>  <dbl>    <dbl> <dbl> <fct>  <dbl>   <dbl>  <dbl>
1 3       3017    10012    28 2          7       2     44
2 3       3017    10017    64 5         34      17    116
3 3       3017    10030    40 1         20      12     61
4 3       3017    10039    30 1         11       3     27
5 3       3017    10069    25 1          2       2     30
6 3       3017    10071    35 1          2       9     38
# ... with 4 more variables: debtinc <dbl>,
#   creddebt <dbl>, othdebt <dbl>, default <fct>

tail(dataset)

  branch ncust customer   age ed    employ address income
  <fct>  <dbl>    <dbl> <dbl> <fct>  <dbl>   <dbl>  <dbl>
1 91      3779   453461    31 2          3       6     24
2 91      3779   453471    34 3          8       4     83
3 91      3779   453578    37 2         10       8     43
4 91      3779   453686    25 5          0       3     16
5 91      3779   453698    34 1         10       8     41
6 91      3779   453777    27 1          2       2     24
# ... with 4 more variables: debtinc <dbl>,
#   creddebt <dbl>, othdebt <dbl>, default <fct>

View(dataset)

str(dataset)

Classes 'tbl_df', 'tbl' and 'data.frame': 1500 obs. of 12 variables:
$ branch : Factor w/ 15 levels "3","13","15",..: 1 1 1 1 1 1 1 1 1 1 ...
$ ncust : num 3017 3017 3017 3017 3017 ...
$ customer: num 10012 10017 10030 10039 10069 ...
$ age : num 28 64 40 30 25 35 26 25 65 21 ...
$ ed : Factor w/ 5 levels "1","2","3","4",..: 2 5 1 1 1 1 3 1 4 3 ...
$ employ : num 7 34 20 11 2 2 2 4 29 0 ...
$ address : num 2 17 12 3 2 9 4 2 14 0 ...
$ income : num 44 116 61 27 30 38 38 30 189 23 ...
$ debtinc : num 17.7 14.7 4.8 34.5 22.4 10.9 11.9 14.4 5 3.9 ...
$ creddebt: num 2.99 5.05 1.04 1.75 0.76 1.46 0.95 1.05 3.36 0.31 ...
$ othdebt : num 4.8 12 1.89 7.56 5.96 2.68 3.57 3.27 6.09 0.59 ...
$ default : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 1 ...

Assign appropriate data types to the variables, such as factor and numeric.

The functions for doing this are as.factor() and as.numeric().

dataset$branch <- as.factor(dataset$branch)
dataset$ed <- as.factor(dataset$ed)
dataset$employ <- as.numeric(dataset$employ)
dataset$address <- as.numeric(dataset$address)
dataset$income <- as.numeric(dataset$income)
dataset$debtinc <- as.numeric(dataset$debtinc)
dataset$creddebt <- as.numeric(dataset$creddebt)
dataset$othdebt <- as.numeric(dataset$othdebt)
dataset$default <- as.factor(dataset$default)
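
The same conversions can also be done in one pass. A compact alternative sketch (num_cols is just a helper name introduced here):

num_cols <- c("employ", "address", "income", "debtinc", "creddebt", "othdebt")
dataset[num_cols] <- lapply(dataset[num_cols], as.numeric)  # convert all numeric columns at once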

Recheck the structure of the data

str(dataset)

Missing value analysis

table(is.na(dataset$branch))
table(is.na(dataset$ncust))
table(is.na(dataset$customer))
table(is.na(dataset$age))
table(is.na(dataset$ed))
table(is.na(dataset$employ))
table(is.na(dataset$address))
table(is.na(dataset$income))
table(is.na(dataset$debtinc))
table(is.na(dataset$creddebt))
table(is.na(dataset$othdebt))
table(is.na(dataset$default))
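
Instead of checking each column separately, one call counts the missing values across all columns at once (an equivalent shortcut):

colSums(is.na(dataset))  # NA count per column; all zeros means no missing values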

Exploratory data analysis

> table(dataset$branch)

  3  13  15  20  25  49  60  64  68  73  74  75  76  77  91
100 100 100 100 100 100 100 100 100 100 100 100 100 100 100

> table(dataset$ncust)

1919 2251 2600 2658 3017 3080 3388 3491 3572 3779 4098 4358 4501 4650 4809
 100  100  100  100  100  100  100  100  100  100  100  100  100  100  100

> table(dataset$ed)

  1   2   3   4   5
246 527 333 310  84

> table(dataset$default)

  0   1
952 548

> table(dataset$employ)

  0   1   2   3   4   5   6   7   8   9  10  11  12  13
404 119 117  98  98  65  62  58  47  49  33  31  29  24
 14  15  16  17  18  19  20  21  22  23  24  25  26  27
 21  25  21  21  16   8  17  10  22   9  12   4  11   4
 28  29  30  31  32  33  34  35  36  37  38  40  42  43
  5   4   7   7   4   7   6   3   2   1   4   3   1   1
 44  45  48  50  51  53  63
  2   1   2   2   1   1   1
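
The default table shows 952 non-defaulters against 548 defaulters, roughly a 63/37 split. To see the shares directly rather than the raw counts:

prop.table(table(dataset$default))  # class proportions of the target variable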

Descriptive statistics for continuous variables

> summary(dataset$age)
Min. 1st Qu. Median Mean 3rd Qu. Max.
18.00 24.00 31.00 34.17 42.00 79.00
> summary(dataset$employ) # this variable has outliers
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 4.000 6.952 10.000 63.000
> summary(dataset$address)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 2.000 5.000 6.305 9.000 34.000
> summary(dataset$income)
Min. 1st Qu. Median Mean 3rd Qu. Max.
12.00 27.00 40.00 59.59 64.00 1079.00
> summary(dataset$debtinc)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 4.800 8.500 9.929 13.525 40.700
> summary(dataset$creddebt)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.420 0.990 1.935 2.200 35.970
> summary(dataset$othdebt)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 1.107 2.215 3.844 4.572 63.470
>
> sd(dataset$age)
[1] 13.14231
> sd(dataset$employ) # this variable has outliers
[1] 8.977644
> sd(dataset$address)
[1] 6.04774
> sd(dataset$income)
[1] 67.13016
> sd(dataset$debtinc)
[1] 6.671884
> sd(dataset$creddebt)
[1] 2.973909
> sd(dataset$othdebt)
[1] 5.333425
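
The summaries above flag employ as having outliers, and income shows the same pattern (a maximum of 1079 against a 3rd quartile of 64). A quick visual check with base-R boxplots:

boxplot(dataset$employ, main = "employ")  # points beyond the upper whisker are the outliers
boxplot(dataset$income, main = "income")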

Data Partitioning

set.seed(123)
pd <- sample(2,nrow(dataset),replace=TRUE,prob = c(0.7,0.3))
train <- dataset[pd==1,] # rows where pd == 1; the empty column index keeps all columns
View(train)
test <- dataset[pd==2,]
View(test)
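
Note that sample() gives only an approximately 70/30 split and does not preserve the default ratio in each part. A minimal alternative sketch using caret's createDataPartition(), which stratifies on the outcome (train_s and test_s are hypothetical names so the train/test objects above stay untouched):

library(caret)
set.seed(123)
idx <- createDataPartition(dataset$default, p = 0.7, list = FALSE)[, 1]  # stratified 70% of row indices
train_s <- dataset[idx, ]
test_s <- dataset[-idx, ]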

## Logistic regression
# Train the model using the training sets and check score
logistic <- glm(formula = default ~ age + ed + income+debtinc+othdebt, data = train, family ="binomial")
summary(logistic)
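
The coefficients reported by summary() are on the log-odds scale; exponentiating them gives odds ratios, which are often easier to interpret (an optional extra step):

exp(coef(logistic))  # odds ratio per one-unit increase in each predictor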

## Use the model to predict the default probability for the test data

predicted.probability <- predict(logistic, newdata = test, type = "response")
predicted.probability

# Now classify based on the predicted probability, using 0.5 as the cut-off
predicted.classes <- ifelse(predicted.probability > 0.5, "1", "0")
table(predicted.classes)
submit <- data.frame(CustomerId = test$customer, default_actual = test$default, default_prob = predicted.probability, default_class = predicted.classes)
write.csv(submit, file = "credit_logistic.csv", row.names = FALSE)
# misclassification error for the test sample
classification <- table(submit$default_actual, submit$default_class)
classification
accuracy_Test <- sum(diag(classification)) / sum(classification)
accuracy_Test
# misclassification rate
1 - sum(diag(classification)) / sum(classification)
Confusion matrix using the caret package

library(caret)
# confusionMatrix() expects factors; predicted.classes is character, so convert it first
submit$default_class <- as.factor(submit$default_class)
confusionMatrix(submit$default_class, submit$default_actual, positive = "1", mode = "everything")

Plotting the Receiver Operating Characteristic (ROC) curve and calculating the area under it


library(InformationValue)
# convert the actual-class factor to numeric 0/1 before running these commands;
# as.numeric() on a factor returns the level index (1/2), so go through as.character()
submit$default_actual <- as.numeric(as.character(submit$default_actual))
str(submit)
# use the predicted probabilities (not the hard 0/1 classes) so the ROC curve is meaningful
plotROC(submit$default_actual, submit$default_prob)

AUROC(submit$default_actual, submit$default_prob)

## decision trees

install.packages('rattle')
install.packages('rpart.plot')
install.packages('RColorBrewer')
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)

# decision tree with the rpart package
# In rpart, the amount of detail in the tree is governed by two parameters:

# cp determines when the splitting of the decision tree stops.
# minsplit sets the minimum number of observations a node must contain before rpart
# attempts a split. Both can be passed explicitly via rpart.control(), as sketched below.
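
A minimal sketch of setting them explicitly; cp = 0.01 and minsplit = 20 are rpart's documented defaults, so this fits the same tree as the call below (fit0 is a throwaway name introduced here):

fit0 <- rpart(default ~ ., data = train, method = "class",
              control = rpart.control(cp = 0.01, minsplit = 20))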

fit <- rpart(default ~ ., data = train, method="class")
fit$variable.importance
View(fit)
fit1 <- rpart(default ~ age + income + debtinc + othdebt + employ + creddebt, data = train, method="class")
fit1$variable.importance
plot(fit)
text(fit)
View(fit)
fancyRpartPlot(fit)

# Is this the optimal decision tree?
# Validating the decision tree
printcp(fit1)
# cp is the complexity parameter: the minimum improvement a split must add to be kept
# pick the cp value with the minimum cross-validated error (xerror) in the cp table
best_cp <- fit1$cptable[which.min(fit1$cptable[, "xerror"]), "CP"]
best_cp

# Let us prune the tree
fit2 <- prune(fit1, cp = 0.01)
fit2$variable.importance
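
If the data-driven cp found above differs from the fixed 0.01, you can prune at it instead (fit2b is a hypothetical alternative to fit2):

fit2b <- prune(fit1, cp = best_cp)  # prune at the cp with the lowest cross-validated error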

# examine the tree
# better visualization of the tree

plot(fit2)
text(fit2)
View(fit2)
fancyRpartPlot(fit2)

# Use the model for predicting prob or class for test data
Predicted_class <- predict(fit2, test, type = "class")

submit1 <- data.frame(CustomerId = test$customer, default_actual = test$default, default_class = Predicted_class)
write.csv(submit1, file = "myfirstdefaulttree.csv", row.names = FALSE)
library(caret)
confusionMatrix(submit1$default_class, submit1$default_actual, positive ="1", mode="everything")

# misclassification error for the test sample
classification <- table(submit1$default_actual, submit1$default_class)
classification
accuracy_Test <- sum(diag(classification)) / sum(classification)
accuracy_Test

## Misclassification
1 - sum(diag(classification)) / sum(classification)
# we got about 30% error
# after running this we can inspect the classification table and decide whether the model
# needs updating or more variables
# here we conclude that the model does need improvement, as the error for the
# defaulter class (1) is very high
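
One possible way to attack that asymmetry, offered here as a sketch rather than part of the original recipe, is rpart's loss matrix, which makes misclassifying an actual defaulter costlier than the reverse (the 3:1 weighting is purely illustrative):

fit_weighted <- rpart(default ~ age + income + debtinc + othdebt + employ + creddebt,
                      data = train, method = "class",
                      parms = list(loss = matrix(c(0, 3, 1, 0), nrow = 2)))  # rows = actual, cols = predicted; cost 3 when an actual 1 is predicted as 0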


## Random forest
install.packages("randomForest")
library("randomForest")
# Random forest is an ensemble of decision trees
rfm_model <- randomForest(formula = default ~., data = train)
print(importance(rfm_model,type = 2))

rfm_model <- randomForest(formula = default ~ age + ed + employ + address + income + debtinc + creddebt + othdebt, data = train)

print(rfm_model)
# By default, the number of trees is 500 and the number of variables tried at each split is 2
# Error rate is 25.6%
# Each decision tree casts a vote for the prediction of the target variable;
# the random forest chooses the prediction that gets the most votes.
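
Because the prediction is a majority vote, you can also inspect the vote shares behind each test-set prediction (columns follow the factor levels "0" and "1"):

vote_share <- predict(rfm_model, newdata = test, type = "prob")  # per-class vote fractions
head(vote_share)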

# Importance of each predictor.

rfm_model$importance

# from the output we conclude that debtinc, creddebt, othdebt, employ and age are
# important predictors of loan default

# Fine tuning parameters of the Random Forest model
# mtry = number of variables tried at each split
# ntree = number of decision trees; you tune these parameters to improve the accuracy
# and to decrease the error rate in prediction
rfm_model2 <- randomForest(formula = default ~ age + ed + employ + address + income + debtinc + creddebt + othdebt, data = train, ntree = 500, mtry = 4, importance = TRUE)
print(rfm_model2)
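
Instead of picking mtry by hand, the randomForest package ships a helper, tuneRF(), that searches over mtry values by out-of-bag error; a sketch (it takes the predictors and the response separately, and as.data.frame() guards against tibble quirks):

predictors <- as.data.frame(train[, c("age", "ed", "employ", "address", "income",
                                      "debtinc", "creddebt", "othdebt")])
set.seed(123)
tuneRF(predictors, train$default, ntreeTry = 500, stepFactor = 2, improve = 0.01)  # plots OOB error vs mtry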

# There is no improvement in accuracy with this data; however, tuning these parameters generally improves it

# Using this model to predict on the test data

# Predicting on the test set; type = "response" returns the predicted class for a classification forest
prediction_RFM <- predict(rfm_model2, newdata = test, type = "response")

submit4 <- data.frame(CustomerId = test$customer, default_actual = test$default, default_class = prediction_RFM)
write.csv(submit4, file = "credit_rfm.csv", row.names = FALSE)
# Checking classification accuracy
table(submit4$default_actual, submit4$default_class)
classification <- table(submit4$default_actual, submit4$default_class)
classification
accuracy_Test <- sum(diag(classification )) / sum(classification)
accuracy_Test

## Misclassification
1 - sum(diag(classification)) / sum(classification)
