Predictive model for Bank customer churn using machine learning

This case study is about predicting customer churn in banks. Please download the dataset given below.

DATASET

This dataset contains the following variables.

RowNumber

CustomerId

Surname

CreditScore

Geography

Gender

Age

Tenure

Balance

NumOfProducts – The number of accounts and bank-affiliated products the customer holds

HasCrCard

IsActiveMember

EstimatedSalary

Exited

  • Did they leave the bank after all?

Import and read the data in R

Solution

## Customer churn model

## Import data ----
# Location of the raw churn CSV. The default is the original workshop path;
# set the CHURN_CSV environment variable to read the file from elsewhere
# without editing this script.
churn_csv_path <- Sys.getenv(
  "CHURN_CSV",
  unset = "D:/ATEEQUE_NIBM/0.0 NIBM/2.0 TEACHING/1.0 MBA/2.0 Marketing research and analytics/4.1 Workshop on customer analytics in Banking/6.0 Hands on exercise_customer churn/2.0 Working data/Churn_Modelling.csv"
)
# Fail early with a clear message instead of read.csv()'s connection error.
if (!file.exists(churn_csv_path)) {
  stop("Churn data file not found: ", churn_csv_path, call. = FALSE)
}
Churn_Modelling <- read.csv(churn_csv_path)

## View data
# The spreadsheet viewer is only meaningful in an interactive session.
if (interactive()) {
  View(Churn_Modelling)
}

## Rename to dataset
dataset <- Churn_Modelling

## Check the structure
str(dataset)

## Convert identifier and categorical columns to factors
# Gender is included so that every categorical column is handled the same way.
factor_columns <- c(
  "RowNumber", "CustomerId", "Surname", "Geography", "Gender", "Tenure",
  "HasCrCard", "IsActiveMember", "Exited"
)
dataset[factor_columns] <- lapply(dataset[factor_columns], as.factor)

## Exploratory data analysis ----
# Results are print()ed explicitly: bare top-level expressions are only
# auto-printed at the console, not when the script is run with source().
categorical_columns <- c(
  "Geography", "Gender", "HasCrCard", "IsActiveMember", "Tenure", "Exited"
)
numeric_columns <- c(
  "CreditScore", "Age", "Balance", "NumOfProducts", "EstimatedSalary"
)

## Frequency analysis
for (column in categorical_columns) {
  cat("\nFrequency of", column, "\n")
  # useNA = "ifany" adds an NA count only when missing values are present.
  print(table(dataset[[column]], useNA = "ifany"))
}

## Descriptive analytics
for (column in numeric_columns) {
  cat("\nSummary of", column, "\n")
  print(summary(dataset[[column]]))
  # na.rm = TRUE keeps the standard deviation defined if a column has NAs.
  cat("Standard deviation:", sd(dataset[[column]], na.rm = TRUE), "\n")
}

## Distribution plots: all boxplots first, then all histograms
for (column in numeric_columns) {
  boxplot(dataset[[column]], main = paste("Boxplot of", column), ylab = column)
}

for (column in numeric_columns) {
  hist(dataset[[column]], main = paste("Histogram of", column), xlab = column)
}

## Missing data analysis ----
# Count NA values for every column in one pass, so that no column is skipped.
# The result is print()ed explicitly so it also appears under source().
missing_counts <- colSums(is.na(dataset))
print(missing_counts)

# Name the columns that would need imputation or row removal before modelling.
columns_with_missing <- names(missing_counts)[missing_counts > 0]
if (length(columns_with_missing) > 0) {
  message(
    "Columns with missing values: ",
    paste(columns_with_missing, collapse = ", ")
  )
} else {
  message("No missing values found.")
}

## Creating training and validation (test) sets from the given dataset ----

# Keep the outcome as a factor so it is treated as a categorical response.
dataset$Exited <- as.factor(dataset$Exited)

# Partition data: each row is independently assigned to group 1 (train) with
# probability 0.7 or group 2 (test) with probability 0.3, so the split sizes
# are approximate. The fixed seed makes the assignment reproducible.
set.seed(90)
pd <- sample(2, nrow(dataset), replace = TRUE, prob = c(0.7, 0.3))
train <- dataset[pd == 1, ] # empty column index keeps all columns
test <- dataset[pd == 2, ]

# The spreadsheet viewer is only meaningful in an interactive session.
if (interactive()) {
  View(train)
  View(test)
}
str(train)
str(test)

# Class counts in each partition. Results are print()ed explicitly so they
# also appear when the script is run with source().
print(table(train$Exited))
print(table(test$Exited))

# Class proportions: train and test should show a similar churn rate if the
# random split is balanced.
print(prop.table(table(train$Exited)))
print(prop.table(table(test$Exited)))

## Logistic regression ----
# Train the model on the training set.
#
# Identifier columns are excluded from the predictors. They are stored as
# factors with (nearly) one level per customer, so `Exited ~ .` would create
# thousands of dummy variables, and predict() would then fail on test rows
# whose identifier levels were never seen during fitting.
id_columns <- c("RowNumber", "CustomerId", "Surname")
predictor_columns <- setdiff(names(train), c("Exited", id_columns))
churn_formula <- reformulate(predictor_columns, response = "Exited")

logistic_churn <- glm(formula = churn_formula, data = train, family = "binomial")

# print() explicitly so the summary also appears under source().
print(summary(logistic_churn))

# Predicted churn probabilities for the test set, from the fitted model
# `logistic_churn`.
my_log_prediction <- predict(logistic_churn, newdata = test, type = "response")

# The spreadsheet viewer is only meaningful in an interactive session.
if (interactive()) {
  View(my_log_prediction)
}

# Export actual outcome and predicted probability per customer. The source
# columns are `CustomerId` and `Exited`; `$` on a missing column name returns
# NULL, which data.frame() silently drops.
submit <- data.frame(
  CustomerID = test$CustomerId,
  y_actual = test$Exited,
  y_prob = my_log_prediction
)
write.csv(submit, file = "tmc_logistic.csv", row.names = FALSE)

# Now we wish to classify based on probability.
# Probabilities above 0.5 are labelled "1", everything else "0".
predicted.classes <- ifelse(my_log_prediction > 0.5, "1", "0")
print(table(predicted.classes))

submit <- data.frame(
  CustomerID = test$CustomerId,
  y_actual = test$Exited,
  y_prob = my_log_prediction,
  y_class = predicted.classes
)
if (interactive()) {
  View(submit)
}
str(submit)
write.csv(submit, file = "tmc_logistic1.csv", row.names = FALSE)

# Confusion matrix for the test sample. Both margins are forced onto the same
# two levels so the table stays 2 x 2 (and diag() stays meaningful) even if
# the model predicts a single class.
# NOTE(review): assumes Exited is coded 0/1 in the CSV, as the "1"/"0" labels
# above already imply -- confirm against the data.
class_levels <- c("0", "1")
classification <- table(
  actual = factor(submit$y_actual, levels = class_levels),
  predicted = factor(submit$y_class, levels = class_levels)
)
print(classification)

# Accuracy: share of test rows on the diagonal of the confusion matrix.
accuracy_Test <- sum(diag(classification)) / sum(classification)
print(accuracy_Test)

## Misclassification error for the test sample
print(1 - accuracy_Test)

Leave a Reply

Your email address will not be published. Required fields are marked *