 ## Case description

This case study shows how to make a credit decision (whether or not to sanction a loan) based on customer data, using R.

Data / Variables. This case study uses data from analyticsvidhya.com

Problem – Whether to approve loan or not (Y / N)

### Solution with R code

2. Create a project in R

3. Link the saved dataset and set up the working directory

R code

`# Load the training and test sets from the working directory.`
`# Since R 4.0.0 read.csv keeps strings as character unless stringsAsFactors = TRUE,`
`# so the conversion to factors is requested explicitly to behave the same on every R version.`
`train <- read.csv("train.csv", header = TRUE, stringsAsFactors = TRUE)`
`test <- read.csv("test.csv", header = TRUE, stringsAsFactors = TRUE)`

`# Show the complete train and test data frames in the console`
`print(train)`
`print(test)`

`# Open both data frames in the data viewer`
`View(train)`
`View(test)`

`# Look at the first rows of each dataset to see the variables`
`head(x = train)`
`head(x = test)`

`# Inspect the structure (columns and their data types) of each dataset`
`str(object = train)`
`str(object = test)`

`# Exploratory data analysis`
`# Categorical columns are converted to factors with as.factor,`
`# and integer columns are converted to numeric with as.numeric.`

`train_factor_cols <- c(`
`  "Gender", "Married", "Dependents", "Education", "Self_Employed",`
`  "Credit_History", "Property_Area", "Loan_Status"`
`)`
`train_numeric_cols <- c(`
`  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"`
`)`
`train[train_factor_cols] <- lapply(train[train_factor_cols], as.factor)`
`train[train_numeric_cols] <- lapply(train[train_numeric_cols], as.numeric)`

`# Frequency table for each categorical variable in the training data`

`for (col in c(`
`  "Gender", "Married", "Dependents", "Education",`
`  "Self_Employed", "Credit_History", "Property_Area"`
`)) {`
`  print(table(train[[col]]))`
`}`

`# Approval rate as counts per Loan_Status value`
`print(table(train[["Loan_Status"]]))`

`# Descriptive statistics for the continuous variables in the training data`
`with(train, summary(ApplicantIncome))`
`with(train, mean(ApplicantIncome))`
`with(train, sd(ApplicantIncome))`

`# Distribution plots for applicant income`
`hist(train\$ApplicantIncome)`
`boxplot(train\$ApplicantIncome)`

`with(train, summary(CoapplicantIncome))`
`with(train, mean(CoapplicantIncome))`
`with(train, sd(CoapplicantIncome))`

`with(train, summary(LoanAmount))`
`# Without na.rm = TRUE the mean is NA because LoanAmount has missing values`
`with(train, mean(LoanAmount))`
`with(train, mean(LoanAmount, na.rm = TRUE))`
`with(train, sd(LoanAmount, na.rm = TRUE))`

`with(train, summary(Loan_Amount_Term))`
`with(train, mean(Loan_Amount_Term, na.rm = TRUE))`
`with(train, sd(Loan_Amount_Term, na.rm = TRUE))`

`# Same type conversions for the test data:`
`# categorical columns become factors, integer columns become numeric.`

`test_factor_cols <- c(`
`  "Gender", "Married", "Dependents", "Education", "Self_Employed",`
`  "Credit_History", "Property_Area"`
`)`
`test_numeric_cols <- c(`
`  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term"`
`)`
`test[test_factor_cols] <- lapply(test[test_factor_cols], as.factor)`
`test[test_numeric_cols] <- lapply(test[test_numeric_cols], as.numeric)`

`## Descriptive statistics for test data`
`# Frequency table for each categorical variable`

`for (col in c(`
`  "Married", "Gender", "Dependents", "Education",`
`  "Self_Employed", "Credit_History", "Property_Area"`
`)) {`
`  print(table(test[[col]]))`
`}`

`# Descriptive statistics for the continuous variables in the test data`

`with(test, summary(ApplicantIncome))`
`with(test, mean(ApplicantIncome))`
`with(test, sd(ApplicantIncome))`

`with(test, summary(CoapplicantIncome))`
`with(test, mean(CoapplicantIncome))`
`with(test, sd(CoapplicantIncome))`

`# na.rm = TRUE skips missing values when computing the mean and standard deviation`
`with(test, summary(LoanAmount))`
`with(test, mean(LoanAmount, na.rm = TRUE))`
`with(test, sd(LoanAmount, na.rm = TRUE))`

`with(test, summary(Loan_Amount_Term))`
`with(test, mean(Loan_Amount_Term, na.rm = TRUE))`
`with(test, sd(Loan_Amount_Term, na.rm = TRUE))`

`# Missing values analysis`
`# Blank cells are stored as empty strings (""), which R does not treat as missing.`
`# For every predictor: recode "" as NA, drop the emptied "" factor level so it no`
`# longer appears as a category in later tables, and count the missing values.`
`# In each printed table, TRUE is the number of missing values.`

`for (col in c(`
`  "Gender", "Married", "Dependents", "Education", "Self_Employed",`
`  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term",`
`  "Credit_History", "Property_Area"`
`)) {`
`  # Guard with !is.na() because comparing an existing NA with "" yields NA`
`  is_blank <- !is.na(train[[col]]) & train[[col]] == ""`
`  train[[col]][is_blank] <- NA`
`  if (is.factor(train[[col]])) {`
`    train[[col]] <- droplevels(train[[col]])`
`  }`
`  # Label the output so each table can be matched to its column`
`  cat(col, "\n", sep = "")`
`  print(table(is.na(train[[col]])))`
`}`

`## Missing values for test data`

`# Missing values analysis`
`# Blank cells are stored as empty strings (""), which R does not treat as missing.`
`# For every predictor: recode "" as NA, drop the emptied "" factor level so it no`
`# longer appears as a category in later tables, and count the missing values.`
`# In each printed table, TRUE is the number of missing values.`

`for (col in c(`
`  "Gender", "Married", "Dependents", "Education", "Self_Employed",`
`  "ApplicantIncome", "CoapplicantIncome", "LoanAmount", "Loan_Amount_Term",`
`  "Credit_History", "Property_Area"`
`)) {`
`  # Guard with !is.na() because comparing an existing NA with "" yields NA`
`  is_blank <- !is.na(test[[col]]) & test[[col]] == ""`
`  test[[col]][is_blank] <- NA`
`  if (is.factor(test[[col]])) {`
`    test[[col]] <- droplevels(test[[col]])`
`  }`
`  # Label the output so each table can be matched to its column`
`  cat(col, "\n", sep = "")`
`  print(table(is.na(test[[col]])))`
`}`

`## Handling missing values`
`# Option 1: remove every row that contains at least one missing value`
`train1 <- na.omit(object = train)`
`str(object = train1)`

`test1 <- na.omit(object = test)`
`str(object = test1)`
`# Because the dataset has few observations, we will not drop the rows with missing values`

`## Missing value imputation`
`# The analysis above shows that the following variables have missing values:`
`# Gender, Married, Dependents, Self_Employed, LoanAmount, Loan_Amount_Term, Credit_History`

`# Categorical variables: fill missing values with a fixed category`
`# (presumably the most frequent one; confirm against the frequency tables above)`
`train\$Gender[is.na(train\$Gender)] <- "Male"`
`train\$Married[is.na(train\$Married)] <- "Yes"`
`train\$Dependents[is.na(train\$Dependents)] <- "0"`
`train\$Self_Employed[is.na(train\$Self_Employed)] <- "No"`
`train\$Credit_History[is.na(train\$Credit_History)] <- "1"`

`# Numeric variables: fill missing values with the mean of the observed values`
`train\$LoanAmount[is.na(train\$LoanAmount)] <- mean(train\$LoanAmount, na.rm = TRUE)`

`train\$Loan_Amount_Term[is.na(train\$Loan_Amount_Term)] <- mean(train\$Loan_Amount_Term, na.rm = TRUE)`

`# For test data`
`# Apply the same imputation as for the training data so both datasets are treated alike.`

`# Categorical variables: fill missing values with a fixed category`
`test\$Gender[is.na(test\$Gender)] <- "Male"`
`# Married is imputed here too, as in the training data; this changes nothing if no value is missing`
`test\$Married[is.na(test\$Married)] <- "Yes"`
`test\$Dependents[is.na(test\$Dependents)] <- "0"`
`test\$Self_Employed[is.na(test\$Self_Employed)] <- "No"`
`test\$Credit_History[is.na(test\$Credit_History)] <- "1"`

`# Numeric variables: fill missing values with the mean of the observed values`
`test\$LoanAmount[is.na(test\$LoanAmount)] <- mean(test\$LoanAmount, na.rm = TRUE)`

`test\$Loan_Amount_Term[is.na(test\$Loan_Amount_Term)] <- mean(test\$Loan_Amount_Term, na.rm = TRUE)`

`# Approval rate as proportions of each Loan_Status value`
`prop.table(x = table(train\$Loan_Status))`

`## Cross tabulation`
`# Cross-tabulate Gender against Loan_Status, including a chi-square test`
`library(gmodels)`
`CrossTable(x = train\$Gender, y = train\$Loan_Status, chisq = TRUE)`

`# Chi-square test of Gender against Loan_Status using base R`
`chisq.test(x = train\$Gender, y = train\$Loan_Status)`