Automate loan approval process based on customer data with R programming

Case description

This case study shows how to decide whether or not to sanction a loan, based on customer data, using R programming.

Data / Variables. This case study uses data from analyticsvidhya.com

Data
Variable	Description
Loan_ID	Unique Loan ID
Gender	Male/ Female
Married	Applicant married (Y/N)
Dependents	Number of dependents
Education	Applicant Education (Graduate/ Under Graduate)
Self_Employed	Self employed (Y/N)
ApplicantIncome	Applicant income
CoapplicantIncome	Coapplicant income
LoanAmount	Loan amount in thousands
Loan_Amount_Term	Term of loan in months
Credit_History	credit history meets guidelines
Property_Area	Urban/ Semi Urban/ Rural
Loan_Status	Loan approved (Y/N)

Problem – Whether to approve loan or not (Y / N)

Download dataset to practice

Solution with R codes

1. Download and save the above dataset on your desktop.

2. Create a project in R

3. Link the saved dataset and set up the working directory

R codes

Read the data into R

# Load the training and test sets from the working directory.
# Note: since R 4.0.0 read.csv() keeps text columns as character vectors by
# default (stringsAsFactors = FALSE); header = TRUE is already its default.
train <- read.csv(file = "train.csv")
test <- read.csv(file = "test.csv")

# Show both data frames on the console
print(train)
print(test)

# Open both data frames in the data viewer
View(train)
View(test)

# First six rows of each data set
head(train, n = 6L)
head(test, n = 6L)

# Column types of each data set
str(train)
str(test)

# Exploratory data analysis ----
# Recode the columns of the training set in a single step: categorical
# variables become factors, integer-valued measurements become doubles.
train <- within(train, {
  # categorical variables -> factor
  Gender <- as.factor(Gender)
  Married <- as.factor(Married)
  Dependents <- as.factor(Dependents)
  Education <- as.factor(Education)
  Self_Employed <- as.factor(Self_Employed)
  Credit_History <- as.factor(Credit_History)
  Property_Area <- as.factor(Property_Area)
  Loan_Status <- as.factor(Loan_Status)

  # continuous variables -> numeric
  ApplicantIncome <- as.numeric(ApplicantIncome)
  CoapplicantIncome <- as.numeric(CoapplicantIncome)
  LoanAmount <- as.numeric(LoanAmount)
  Loan_Amount_Term <- as.numeric(Loan_Amount_Term)
})

# Frequency tables for the categorical variables of the training set:
# one count per category of each variable

table(train$Gender)
table(train$Married)
table(train$Dependents)
table(train$Education)
table(train$Self_Employed)
table(train$Credit_History)
table(train$Property_Area)

# Approval rate in absolute numbers (count of each Loan_Status value)
table(train$Loan_Status)

# Descriptive statistics for the continuous variables:
# summary, mean and standard deviation of each
summary(train$ApplicantIncome)
mean(train$ApplicantIncome)
sd(train$ApplicantIncome)

# Distribution of applicant income as a histogram and a box plot
hist(train$ApplicantIncome)
boxplot(train$ApplicantIncome)

summary(train$CoapplicantIncome)
mean(train$CoapplicantIncome)
sd(train$CoapplicantIncome)

summary(train$LoanAmount)
mean(train$LoanAmount) ## Returns NA because LoanAmount has missing values; pass na.rm = TRUE to ignore them
mean(train$LoanAmount, na.rm = TRUE)
sd(train$LoanAmount, na.rm = TRUE)

# Loan_Amount_Term is summarised with na.rm = TRUE as well
summary(train$Loan_Amount_Term)
mean(train$Loan_Amount_Term, na.rm = TRUE)
sd(train$Loan_Amount_Term, na.rm = TRUE)

# Same type conversion for the test set ----
# The test set has no Loan_Status column to convert.
test <- within(test, {
  # categorical variables -> factor
  Gender <- as.factor(Gender)
  Married <- as.factor(Married)
  Dependents <- as.factor(Dependents)
  Education <- as.factor(Education)
  Self_Employed <- as.factor(Self_Employed)
  Credit_History <- as.factor(Credit_History)
  Property_Area <- as.factor(Property_Area)

  # continuous variables -> numeric
  ApplicantIncome <- as.numeric(ApplicantIncome)
  CoapplicantIncome <- as.numeric(CoapplicantIncome)
  LoanAmount <- as.numeric(LoanAmount)
  Loan_Amount_Term <- as.numeric(Loan_Amount_Term)
})

## Descriptive statistics for the test set

# Category counts for each categorical variable
table(test[["Married"]])
table(test[["Gender"]])
table(test[["Dependents"]])
table(test[["Education"]])
table(test[["Self_Employed"]])
table(test[["Credit_History"]])
table(test[["Property_Area"]])

# Summary, mean and standard deviation of each continuous variable

summary(test[["ApplicantIncome"]])
mean(test[["ApplicantIncome"]])
sd(test[["ApplicantIncome"]])

summary(test[["CoapplicantIncome"]])
mean(test[["CoapplicantIncome"]])
sd(test[["CoapplicantIncome"]])

# na.rm = TRUE: missing values are ignored for the two loan columns
summary(test[["LoanAmount"]])
mean(test[["LoanAmount"]], na.rm = TRUE)
sd(test[["LoanAmount"]], na.rm = TRUE)

summary(test[["Loan_Amount_Term"]])
mean(test[["Loan_Amount_Term"]], na.rm = TRUE)
sd(test[["Loan_Amount_Term"]], na.rm = TRUE)

# Missing values analysis ----
# Blank cells in the categorical columns are stored as the empty string "",
# which R does not treat as missing, so "" is recoded to NA below.
# The four columns converted with as.numeric() earlier (ApplicantIncome,
# CoapplicantIncome, LoanAmount, Loan_Amount_Term) can never equal "", so they
# need no recoding and are only counted.

train$Gender[train$Gender == ""] <- NA
train$Married[train$Married == ""] <- NA
train$Dependents[train$Dependents == ""] <- NA
train$Education[train$Education == ""] <- NA
train$Self_Employed[train$Self_Employed == ""] <- NA
train$Credit_History[train$Credit_History == ""] <- NA
train$Property_Area[train$Property_Area == ""] <- NA

# Recoding the values leaves "" behind as an empty factor level. Drop unused
# levels so that levels(), table() and model contrasts no longer carry a
# phantom "" category.
train <- droplevels(train)

# Frequency of missing values per column
# TRUE indicates the number of missing values
table(is.na(train$Gender))
table(is.na(train$Married))
table(is.na(train$Dependents))
table(is.na(train$Education))
table(is.na(train$Self_Employed))
table(is.na(train$ApplicantIncome))
table(is.na(train$CoapplicantIncome))
table(is.na(train$LoanAmount))
table(is.na(train$Loan_Amount_Term))
table(is.na(train$Credit_History))
table(is.na(train$Property_Area))

## Missing values for test data
# Blank cells in the categorical columns are stored as the empty string "",
# which R does not treat as missing, so "" is recoded to NA below.
# The four columns converted with as.numeric() earlier (ApplicantIncome,
# CoapplicantIncome, LoanAmount, Loan_Amount_Term) can never equal "", so they
# need no recoding and are only counted.

test$Gender[test$Gender == ""] <- NA
test$Married[test$Married == ""] <- NA
test$Dependents[test$Dependents == ""] <- NA
test$Education[test$Education == ""] <- NA
test$Self_Employed[test$Self_Employed == ""] <- NA
test$Credit_History[test$Credit_History == ""] <- NA
test$Property_Area[test$Property_Area == ""] <- NA

# Recoding the values leaves "" behind as an empty factor level. Drop unused
# levels so that levels(), table() and model contrasts no longer carry a
# phantom "" category.
test <- droplevels(test)

# Frequency of missing values per column
# TRUE indicates the number of missing values
table(is.na(test$Gender))
table(is.na(test$Married))
table(is.na(test$Dependents))
table(is.na(test$Education))
table(is.na(test$Self_Employed))
table(is.na(test$ApplicantIncome))
table(is.na(test$CoapplicantIncome))
table(is.na(test$LoanAmount))
table(is.na(test$Loan_Amount_Term))
table(is.na(test$Credit_History))
table(is.na(test$Property_Area))

## Handling missing values
# The first option is to delete / drop every row that contains a missing value.
# train1 and test1 hold the complete cases only; str() shows how many rows
# survive.
train1 <- na.omit(train)
str(train1)

test1 <- na.omit(test)
str(test1)
# Since we have few observations we will not drop the missing values

## Missing value imputation
# From the above analysis, we got to know that following variables have missing values
# Gender, Married, Dependents, Self_Employed, LoanAmount, Loan_Amount_Term, Credit_History

# Categorical variables: fill the gaps with one fixed category per variable
# (presumably the most frequent one in the frequency tables -- confirm).
# Each replacement value must be an existing level of its factor; anything
# else yields NA plus an "invalid factor level" warning.
train$Gender[is.na(train$Gender)] <- "Male"
train$Married[is.na(train$Married)] <- "Yes"
train$Dependents[is.na(train$Dependents)] <- "0"
train$Self_Employed[is.na(train$Self_Employed)] <- "No"
train$Credit_History[is.na(train$Credit_History)] <- "1"

# Continuous variables: fill the gaps with the mean of the observed values
train$LoanAmount[is.na(train$LoanAmount)] <- mean(train$LoanAmount, na.rm = TRUE)

train$Loan_Amount_Term[is.na(train$Loan_Amount_Term)] <- mean(train$Loan_Amount_Term, na.rm = TRUE)

# Imputation for the test set ----
# Categorical gaps are filled with a fixed category, numeric gaps with the
# mean of the observed test values.

test$Gender <- replace(test$Gender, is.na(test$Gender), "Male")
test$Dependents <- replace(test$Dependents, is.na(test$Dependents), "0")
test$Self_Employed <- replace(
  test$Self_Employed,
  is.na(test$Self_Employed),
  "No"
)
test$Credit_History <- replace(
  test$Credit_History,
  is.na(test$Credit_History),
  "1"
)
test$LoanAmount <- replace(
  test$LoanAmount,
  is.na(test$LoanAmount),
  mean(test$LoanAmount, na.rm = TRUE)
)

test$Loan_Amount_Term <- replace(
  test$Loan_Amount_Term,
  is.na(test$Loan_Amount_Term),
  mean(test$Loan_Amount_Term, na.rm = TRUE)
)

# Approval rate as proportions: share of each Loan_Status value in the
# training set
prop.table(table(train$Loan_Status))

## Cross tabulation of Gender against Loan_Status
# CrossTable() is called after loading gmodels; chisq = TRUE presumably adds a
# chi-square test to its output -- confirm against the gmodels documentation.
library(gmodels)
CrossTable(train$Gender, train$Loan_Status, chisq = TRUE)

# Chi-square test on the same two variables using base R's chisq.test()
chisq.test(train$Gender, train$Loan_Status)