Credit Risk: Predictive machine learning model for loan default, with R code

This case study walks through the steps to build a loan default model using machine learning, with R code.

Download the dataset

Download the dataset by clicking on dataset no. 4, entitled “Credit Risk: Predicting loan default”.

  1. Create a new project in R. If you are new to R programming, follow the steps in the link below to create a project in R:

http://datasciencevidhya.com/post/how-to-create-a-project-in-r-studio

2. Set up the working directory for your project

            setwd("     ")   # put the path to your project folder between the quotes

The data is in Excel format.

To read Excel files in R you will need the readxl package. Install it once with install.packages(), then load it with library().

install.packages("readxl")

library("readxl")

dataDT <- read_excel("dataDT.xls")

dataset <- dataDT

Data Exploration

head(dataset)

  branch ncust customer   age ed    employ address income
  <fct>  <dbl>    <dbl> <dbl> <fct>  <dbl>   <dbl>  <dbl>
1 3       3017    10012    28 2          7       2     44
2 3       3017    10017    64 5         34      17    116
3 3       3017    10030    40 1         20      12     61
4 3       3017    10039    30 1         11       3     27
5 3       3017    10069    25 1          2       2     30
6 3       3017    10071    35 1          2       9     38
# ... with 4 more variables: debtinc <dbl>,
#   creddebt <dbl>, othdebt <dbl>, default <fct>

tail(dataset)

  branch ncust customer   age ed    employ address income
  <fct>  <dbl>    <dbl> <dbl> <fct>  <dbl>   <dbl>  <dbl>
1 91      3779   453461    31 2          3       6     24
2 91      3779   453471    34 3          8       4     83
3 91      3779   453578    37 2         10       8     43
4 91      3779   453686    25 5          0       3     16
5 91      3779   453698    34 1         10       8     41
6 91      3779   453777    27 1          2       2     24
# ... with 4 more variables: debtinc <dbl>,
#   creddebt <dbl>, othdebt <dbl>, default <fct>

View(dataset)

str(dataset)

Classes 'tbl_df', 'tbl' and 'data.frame': 1500 obs. of 12 variables:
$ branch : Factor w/ 15 levels "3","13","15",..: 1 1 1 1 1 1 1 1 1 1 ...
$ ncust : num 3017 3017 3017 3017 3017 ...
$ customer: num 10012 10017 10030 10039 10069 ...
$ age : num 28 64 40 30 25 35 26 25 65 21 ...
$ ed : Factor w/ 5 levels "1","2","3","4",..: 2 5 1 1 1 1 3 1 4 3 ...
$ employ : num 7 34 20 11 2 2 2 4 29 0 ...
$ address : num 2 17 12 3 2 9 4 2 14 0 ...
$ income : num 44 116 61 27 30 38 38 30 189 23 ...
$ debtinc : num 17.7 14.7 4.8 34.5 22.4 10.9 11.9 14.4 5 3.9 ...
$ creddebt: num 2.99 5.05 1.04 1.75 0.76 1.46 0.95 1.05 3.36 0.31 ...
$ othdebt : num 4.8 12 1.89 7.56 5.96 2.68 3.57 3.27 6.09 0.59 ...
$ default : Factor w/ 2 levels "0","1": 1 1 1 1 2 2 2 1 1 1 ...

Assign appropriate data types to the variables, such as factor and numeric.

The functions for doing this are as.factor() and as.numeric().

dataset$branch <- as.factor(dataset$branch)
dataset$ed <- as.factor(dataset$ed)
dataset$employ <- as.numeric(dataset$employ)
dataset$address <- as.numeric(dataset$address)
dataset$income <- as.numeric(dataset$income)
dataset$debtinc <- as.numeric(dataset$debtinc)
dataset$creddebt <- as.numeric(dataset$creddebt)
dataset$othdebt <- as.numeric(dataset$othdebt)
dataset$default <- as.factor(dataset$default)
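
The same conversions can also be done in one pass. A compact alternative sketch (num_cols is just a helper name introduced here):

num_cols <- c("employ", "address", "income", "debtinc", "creddebt", "othdebt")
dataset[num_cols] <- lapply(dataset[num_cols], as.numeric)  # convert all numeric columns at once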

Recheck the structure of the data

str(dataset)

Missing value analysis

table(is.na(dataset$branch))
table(is.na(dataset$ncust))
table(is.na(dataset$customer))
table(is.na(dataset$age))
table(is.na(dataset$ed))
table(is.na(dataset$employ))
table(is.na(dataset$address))
table(is.na(dataset$income))
table(is.na(dataset$debtinc))
table(is.na(dataset$creddebt))
table(is.na(dataset$othdebt))
table(is.na(dataset$default))
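
Instead of checking each column separately, one call counts the missing values across all columns at once (an equivalent shortcut):

colSums(is.na(dataset))  # NA count per column; all zeros means no missing values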

Exploratory data analysis

> table(dataset$branch)

  3  13  15  20  25  49  60  64  68  73  74  75  76  77  91
100 100 100 100 100 100 100 100 100 100 100 100 100 100 100

> table(dataset$ncust)

1919 2251 2600 2658 3017 3080 3388 3491 3572 3779 4098 4358 4501 4650 4809
 100  100  100  100  100  100  100  100  100  100  100  100  100  100  100

> table(dataset$ed)

  1   2   3   4   5
246 527 333 310  84

> table(dataset$default)

  0   1
952 548

> table(dataset$employ)

  0   1   2   3   4   5   6   7   8   9  10  11  12  13
404 119 117  98  98  65  62  58  47  49  33  31  29  24
 14  15  16  17  18  19  20  21  22  23  24  25  26  27
 21  25  21  21  16   8  17  10  22   9  12   4  11   4
 28  29  30  31  32  33  34  35  36  37  38  40  42  43
  5   4   7   7   4   7   6   3   2   1   4   3   1   1
 44  45  48  50  51  53  63
  2   1   2   2   1   1   1
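
The default table shows 952 non-defaulters against 548 defaulters, roughly a 63/37 split. To see the shares directly rather than the raw counts:

prop.table(table(dataset$default))  # class proportions of the target variable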

Descriptive statistics for continuous variables

> summary(dataset$age)
Min. 1st Qu. Median Mean 3rd Qu. Max.
18.00 24.00 31.00 34.17 42.00 79.00
> summary(dataset$employ) # this variable has outliers
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.000 4.000 6.952 10.000 63.000
> summary(dataset$address)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 2.000 5.000 6.305 9.000 34.000
> summary(dataset$income)
Min. 1st Qu. Median Mean 3rd Qu. Max.
12.00 27.00 40.00 59.59 64.00 1079.00
> summary(dataset$debtinc)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 4.800 8.500 9.929 13.525 40.700
> summary(dataset$creddebt)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 0.420 0.990 1.935 2.200 35.970
> summary(dataset$othdebt)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 1.107 2.215 3.844 4.572 63.470
>
> sd(dataset$age)
[1] 13.14231
> sd(dataset$employ) # this variable has outliers
[1] 8.977644
> sd(dataset$address)
[1] 6.04774
> sd(dataset$income)
[1] 67.13016
> sd(dataset$debtinc)
[1] 6.671884
> sd(dataset$creddebt)
[1] 2.973909
> sd(dataset$othdebt)
[1] 5.333425
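
The summaries above flag employ as having outliers, and income shows the same pattern (a maximum of 1079 against a 3rd quartile of 64). A quick visual check with base-R boxplots:

boxplot(dataset$employ, main = "employ")  # points beyond the upper whisker are the outliers
boxplot(dataset$income, main = "income")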

Data Partitioning

set.seed(123)
pd <- sample(2,nrow(dataset),replace=TRUE,prob = c(0.7,0.3))
train <- dataset[pd==1,] # rows where pd == 1; the empty column index keeps all columns
View(train)
test <- dataset[pd==2,]
View(test)
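
Note that sample() gives only an approximately 70/30 split and does not preserve the default ratio in each part. A minimal alternative sketch using caret's createDataPartition(), which stratifies on the outcome (train_s and test_s are hypothetical names so the train/test objects above stay untouched):

library(caret)
set.seed(123)
idx <- createDataPartition(dataset$default, p = 0.7, list = FALSE)[, 1]  # stratified 70% of row indices
train_s <- dataset[idx, ]
test_s <- dataset[-idx, ]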

## Logistic regression
# Train the model using the training sets and check score
logistic <- glm(formula = default ~ age + ed + income+debtinc+othdebt, data = train, family ="binomial")
summary(logistic)
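
The coefficients reported by summary() are on the log-odds scale; exponentiating them gives odds ratios, which are often easier to interpret (an optional extra step):

exp(coef(logistic))  # odds ratio per one-unit increase in each predictor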

## Use the model to predict the default probability for the test data

predicted.probability <- predict(logistic, newdata = test, type = "response")
predicted.probability

# Now classify based on the predicted probability, using 0.5 as the cut-off
predicted.classes <- ifelse(predicted.probability > 0.5, "1", "0")
table(predicted.classes)
submit <- data.frame(CustomerId = test$customer, default_actual = test$default, default_prob = predicted.probability, default_class = predicted.classes)
write.csv(submit, file = "credit_logistic.csv", row.names = FALSE)
# misclassification error for the test sample
classification <- table(submit$default_actual, submit$default_class)
classification
accuracy_Test <- sum(diag(classification)) / sum(classification)
accuracy_Test
# misclassification rate
1 - sum(diag(classification)) / sum(classification)
Confusion matrix using the caret package

library(caret)
# confusionMatrix() expects factors; predicted.classes is character, so convert it first
submit$default_class <- as.factor(submit$default_class)
confusionMatrix(submit$default_class, submit$default_actual, positive = "1", mode = "everything")

Plotting the Receiver Operating Characteristic (ROC) curve and calculating the area under it


library(InformationValue)
# convert the actual-class factor to numeric 0/1 before running these commands;
# as.numeric() on a factor returns the level index (1/2), so go through as.character()
submit$default_actual <- as.numeric(as.character(submit$default_actual))
str(submit)
# use the predicted probabilities (not the hard 0/1 classes) so the ROC curve is meaningful
plotROC(submit$default_actual, submit$default_prob)

AUROC(submit$default_actual, submit$default_prob)

## decision trees

install.packages('rattle')
install.packages('rpart.plot')
install.packages('RColorBrewer')
library(rpart)
library(rattle)
library(rpart.plot)
library(RColorBrewer)

# decision tree with the rpart package
# In rpart, the amount of detail in the tree is governed by two parameters:

# cp determines when the splitting of the decision tree stops.
# minsplit sets the minimum number of observations a node must contain before rpart
# attempts a split. Both can be passed explicitly via rpart.control(), as sketched below.
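
A minimal sketch of setting them explicitly; cp = 0.01 and minsplit = 20 are rpart's documented defaults, so this fits the same tree as the call below (fit0 is a throwaway name introduced here):

fit0 <- rpart(default ~ ., data = train, method = "class",
              control = rpart.control(cp = 0.01, minsplit = 20))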

fit <- rpart(default ~ ., data = train, method="class")
fit$variable.importance
View(fit)
fit1 <- rpart(default ~ age + income + debtinc + othdebt + employ + creddebt, data = train, method="class")
fit1$variable.importance
plot(fit)
text(fit)
View(fit)
fancyRpartPlot(fit)

# Is this the optimal decision tree?
# Validating the decision tree
printcp(fit1)
# cp is the complexity parameter: the minimum improvement a split must add to be kept
# pick the cp value with the minimum cross-validated error (xerror) in the cp table
best_cp <- fit1$cptable[which.min(fit1$cptable[, "xerror"]), "CP"]
best_cp

# Let us prune the tree
fit2 <- prune(fit1, cp = 0.01)
fit2$variable.importance
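
If the data-driven cp found above differs from the fixed 0.01, you can prune at it instead (fit2b is a hypothetical alternative to fit2):

fit2b <- prune(fit1, cp = best_cp)  # prune at the cp with the lowest cross-validated error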

# examine the tree
# better visualization of the tree

plot(fit2)
text(fit2)
View(fit2)
fancyRpartPlot(fit2)

# Use the model for predicting prob or class for test data
Predicted_class <- predict(fit2, test, type = "class")

submit1 <- data.frame(CustomerId = test$customer, default_actual = test$default, default_class = Predicted_class)
write.csv(submit1, file = "myfirstdefaulttree.csv", row.names = FALSE)
library(caret)
confusionMatrix(submit1$default_class, submit1$default_actual, positive ="1", mode="everything")

# misclassification error for the test sample
classification <- table(submit1$default_actual, submit1$default_class)
classification
accuracy_Test <- sum(diag(classification)) / sum(classification)
accuracy_Test

## Misclassification
1 - sum(diag(classification)) / sum(classification)
# we got about 30% error
# after running this we can inspect the classification table and decide whether the model
# needs updating or more variables
# here we conclude that the model does need improvement, as the error for the
# defaulter class (1) is very high
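
One possible way to attack that asymmetry, offered here as a sketch rather than part of the original recipe, is rpart's loss matrix, which makes misclassifying an actual defaulter costlier than the reverse (the 3:1 weighting is purely illustrative):

fit_weighted <- rpart(default ~ age + income + debtinc + othdebt + employ + creddebt,
                      data = train, method = "class",
                      parms = list(loss = matrix(c(0, 3, 1, 0), nrow = 2)))  # rows = actual, cols = predicted; cost 3 when an actual 1 is predicted as 0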


## Random forest
install.packages("randomForest")
library("randomForest")
# Random forest is an ensemble of decision trees
rfm_model <- randomForest(formula = default ~., data = train)
print(importance(rfm_model,type = 2))

rfm_model <- randomForest(formula = default ~ age + ed + employ + address + income + debtinc + creddebt + othdebt, data = train)

print(rfm_model)
# By default, the number of trees is 500 and the number of variables tried at each split is 2
# Error rate is 25.6%
# Each decision tree casts a vote for the prediction of the target variable;
# the random forest chooses the prediction that gets the most votes.
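
Because the prediction is a majority vote, you can also inspect the vote shares behind each test-set prediction (columns follow the factor levels "0" and "1"):

vote_share <- predict(rfm_model, newdata = test, type = "prob")  # per-class vote fractions
head(vote_share)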

# Importance of each predictor.

rfm_model$importance

# from the output we conclude that debtinc, creddebt, othdebt, employ and age are
# important predictors of loan default

# Fine tuning parameters of the Random Forest model
# mtry = number of variables tried at each split
# ntree = number of decision trees; you tune these parameters to improve the accuracy
# and to decrease the error rate in prediction
rfm_model2 <- randomForest(formula = default ~ age + ed + employ + address + income + debtinc + creddebt + othdebt, data = train, ntree = 500, mtry = 4, importance = TRUE)
print(rfm_model2)
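
Instead of picking mtry by hand, the randomForest package ships a helper, tuneRF(), that searches over mtry values by out-of-bag error; a sketch (it takes the predictors and the response separately, and as.data.frame() guards against tibble quirks):

predictors <- as.data.frame(train[, c("age", "ed", "employ", "address", "income",
                                      "debtinc", "creddebt", "othdebt")])
set.seed(123)
tuneRF(predictors, train$default, ntreeTry = 500, stepFactor = 2, improve = 0.01)  # plots OOB error vs mtry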

# There is no improvement in accuracy with this data; however, tuning these parameters generally improves it

# Using this model to predict on the test data

# Predicting on the test set; type = "response" returns the predicted class for a classification forest
prediction_RFM <- predict(rfm_model2, newdata = test, type = "response")

submit4 <- data.frame(CustomerId = test$customer, default_actual = test$default, default_class = prediction_RFM)
write.csv(submit4, file = "credit_rfm.csv", row.names = FALSE)
# Checking classification accuracy
table(submit4$default_actual, submit4$default_class)
classification <- table(submit4$default_actual, submit4$default_class)
classification
accuracy_Test <- sum(diag(classification )) / sum(classification)
accuracy_Test

## Misclassification
1 - sum(diag(classification)) / sum(classification)
