## Case description

This case study discusses how to make a credit decision (whether or not to sanction a loan) based on customer data, using R.

Data / Variables. This case study uses data from analyticsvidhya.com

Problem – Whether to approve loan or not (Y / N)

### Solution with R codes

1. Download and save the above dataset on your desktop.

2. Create a project in R

3. Link the saved dataset and set up the working directory

R codes

Read the data into R

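# Load the training and test CSV files with read.csv() into data frames named `train` and `test`; all code below relies on these two objects.
# Note: read.csv() coerces strings to factors only when stringsAsFactors = TRUE (this was the default before R 4.0.0).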

# Show both data sets in the console
print(train)
print(test)

# Open each data set in the spreadsheet-style data viewer
View(train)
View(test)

# Explore the variables of each data set:
# str() lists every column together with its data type
str(train)
str(test)

# Exploratory data analysis: set the column types of the training data.
# Categorical variables are converted to factors with as.factor();
# integer variables are converted to numeric with as.numeric().
train_factor_cols <- c(
  "Gender", "Married", "Dependents", "Education", "Self_Employed",
  "Credit_History", "Property_Area", "Loan_Status"
)
train_numeric_cols <- c(
  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"
)

# Convert each group of columns in a single vectorised step
train[train_factor_cols] <- lapply(train[train_factor_cols], as.factor)
train[train_numeric_cols] <- lapply(train[train_numeric_cols], as.numeric)

# Frequency tables for the categorical variables, one table per column.
# Loan_Status comes last: its table is the approval rate in numbers.
train_table_cols <- c(
  "Gender", "Married", "Dependents", "Education", "Self_Employed",
  "Credit_History", "Property_Area", "Loan_Status"
)
for (col in train_table_cols) {
  # print() is required inside a loop, where results are not auto-printed
  print(table(train[[col]]))
}

# Descriptive statistics for continuous variables (training data)
# ApplicantIncome: summary() output, then mean and standard deviation
summary(train\$ApplicantIncome)
mean(train\$ApplicantIncome)
sd(train\$ApplicantIncome)

# Plot the distribution of ApplicantIncome as a histogram and as a box plot
hist(train\$ApplicantIncome)
boxplot(train\$ApplicantIncome)

# CoapplicantIncome: summary, mean and standard deviation
coapplicant_income <- train[["CoapplicantIncome"]]
summary(coapplicant_income)
mean(coapplicant_income)
sd(coapplicant_income)

# LoanAmount: the plain mean() call below returns NA because of missing
# values; passing na.rm = TRUE removes them before the statistic is computed
loan_amount <- train[["LoanAmount"]]
summary(loan_amount)
mean(loan_amount)
mean(loan_amount, na.rm = TRUE)
sd(loan_amount, na.rm = TRUE)

# Loan_Amount_Term: computed with na.rm = TRUE as well
loan_amount_term <- train[["Loan_Amount_Term"]]
summary(loan_amount_term)
mean(loan_amount_term, na.rm = TRUE)
sd(loan_amount_term, na.rm = TRUE)

# Same type preparation for the test data:
# categorical variables become factors, integer variables become numeric.
test_factor_cols <- c(
  "Gender", "Married", "Dependents", "Education", "Self_Employed",
  "Credit_History", "Property_Area"
)
test_numeric_cols <- c(
  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"
)

# Convert each group of columns in a single vectorised step
test[test_factor_cols] <- lapply(test[test_factor_cols], as.factor)
test[test_numeric_cols] <- lapply(test[test_numeric_cols], as.numeric)

## Descriptive statistics for test data

# Frequency tables for the categorical variables, one table per column
test_table_cols <- c(
  "Married", "Gender", "Dependents", "Education", "Self_Employed",
  "Credit_History", "Property_Area"
)
for (col in test_table_cols) {
  # print() is required inside a loop, where results are not auto-printed
  print(table(test[[col]]))
}

# Descriptive statistics for continuous variables (test data)

# Income columns: summary, mean and standard deviation without NA removal
for (col in c("ApplicantIncome", "CoapplicantIncome")) {
  values <- test[[col]]
  print(summary(values))
  print(mean(values))
  print(sd(values))
}

# Loan columns: the same statistics with na.rm = TRUE, so that missing
# values are removed before the mean and standard deviation are computed
for (col in c("LoanAmount", "Loan_Amount_Term")) {
  values <- test[[col]]
  print(summary(values))
  print(mean(values, na.rm = TRUE))
  print(sd(values, na.rm = TRUE))
}

# Missing values analysis (training data)
# The categorical columns may store a missing value as an empty string ("")
# rather than NA, and R does not treat an empty string as missing.
# Replace every "" with NA, then drop the now-unused "" factor level so it
# does not show up as an all-zero category in later tables and
# cross tabulations.
train_categorical_cols <- c(
  "Gender", "Married", "Dependents", "Education", "Self_Employed",
  "Credit_History", "Property_Area"
)
for (col in train_categorical_cols) {
  # %in% never returns NA, so entries that are already NA are left untouched
  train[[col]][train[[col]] %in% ""] <- NA
  if (is.factor(train[[col]])) {
    train[[col]] <- droplevels(train[[col]])
  }
}

# Check the frequency of missing values in every predictor column.
# TRUE indicates the number of missing values.
# The numeric columns (converted with as.numeric earlier) cannot contain ""
# and already use NA for missing entries, so they are only counted here.
train_na_check_cols <- c(
  "Gender", "Married", "Dependents", "Education", "Self_Employed",
  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term",
  "Credit_History", "Property_Area"
)
for (col in train_na_check_cols) {
  # Label each table so the output can be matched to its column
  cat(col, ":\n", sep = "")
  print(table(is.na(train[[col]])))
}

## Missing values for test data

# Missing values analysis
# The categorical columns may store a missing value as an empty string ("")
# rather than NA, and R does not treat an empty string as missing.
# Replace every "" with NA, then drop the now-unused "" factor level so it
# does not show up as an all-zero category in later tables.
test_categorical_cols <- c(
  "Gender", "Married", "Dependents", "Education", "Self_Employed",
  "Credit_History", "Property_Area"
)
for (col in test_categorical_cols) {
  # %in% never returns NA, so entries that are already NA are left untouched
  test[[col]][test[[col]] %in% ""] <- NA
  if (is.factor(test[[col]])) {
    test[[col]] <- droplevels(test[[col]])
  }
}

# Check the frequency of missing values in every predictor column.
# TRUE indicates the number of missing values.
# The numeric columns (converted with as.numeric earlier) cannot contain ""
# and already use NA for missing entries, so they are only counted here.
test_na_check_cols <- c(
  "Gender", "Married", "Dependents", "Education", "Self_Employed",
  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term",
  "Credit_History", "Property_Area"
)
for (col in test_na_check_cols) {
  # Label each table so the output can be matched to its column
  cat(col, ":\n", sep = "")
  print(table(is.na(test[[col]])))
}

## Handling missing values
# The first option is to delete / drop the rows with missing values from the
# analysis. The reduced copies are stored in train1 / test1, so the original
# train and test data frames are left unchanged.
train1 <- na.omit(train)
str(train1)

test1 <- na.omit(test)
str(test1)
# Since we have few observations, we will not drop the missing values;
# str() above shows how many rows would remain after dropping them.

## Missing value imputation
# From the above analysis, we got to know that following variables have missing values
# Gender, Married, Dependents, Self_employed, LoanAmount, Loan_Amount_Term, Credit_History

# Categorical variables: every NA is replaced by one fixed category per column
# (presumably the most frequent category in the frequency tables - confirm).
train_fill_values <- c(
  Gender = "Male",
  Married = "Yes",
  Dependents = "0",
  Self_Employed = "No",
  Credit_History = "1"
)
for (col in names(train_fill_values)) {
  train[[col]][is.na(train[[col]])] <- train_fill_values[[col]]
}

# Numeric variables: every NA is replaced by the mean of the observed values
for (col in c("LoanAmount", "Loan_Amount_Term")) {
  train[[col]][is.na(train[[col]])] <- mean(train[[col]], na.rm = TRUE)
}

# For test data

# Categorical variables: every NA is replaced by the same fixed category that
# is used for the training data. Married is included for consistency with the
# training data; the replacement does nothing when the column has no NA.
test_fill_values <- c(
  Gender = "Male",
  Married = "Yes",
  Dependents = "0",
  Self_Employed = "No",
  Credit_History = "1"
)
for (col in names(test_fill_values)) {
  test[[col]][is.na(test[[col]])] <- test_fill_values[[col]]
}

# Numeric variables: every NA is replaced by the mean of the observed values
# of the test data itself
for (col in c("LoanAmount", "Loan_Amount_Term")) {
  test[[col]][is.na(test[[col]])] <- mean(test[[col]], na.rm = TRUE)
}

# Approval rate as a proportion: prop.table() turns the Loan_Status counts
# into shares of the total
prop.table(table(train\$Loan_Status))

## Cross tabulation
# CrossTable() comes from the gmodels package, which must be installed
library(gmodels)
# Cross tabulation of Gender against Loan_Status; chisq = TRUE additionally
# requests a chi-square test of the table
CrossTable(train\$Gender, train\$Loan_Status, chisq = TRUE)

# The same pair of variables tested with base R's chi-squared test
chisq.test(train\$Gender, train\$Loan_Status)