Targeted Marketing Campaign for Cross-Selling Term Deposits: Predicting Conversion with Machine Learning

# Setting up the working directory (insert your own path in setwd below)
setwd("")
# Getting working directory
getwd()
# Import the data
install.packages("readr")

library(readr)
raw_dataset <- read_csv("C:/Users/91917/Desktop/raw_dataset.csv")
View(raw_dataset)

dataset <- raw_dataset

## Data exploration

str(dataset)
head(dataset)
tail(dataset)
View(dataset)

# Convert each column to its appropriate type: categorical variables to factors,
# numeric variables to numeric
dataset$age <- as.numeric(dataset$age)
dataset$job <- as.factor(dataset$job)
dataset$marital <- as.factor(dataset$marital)
dataset$education <- as.factor(dataset$education)
dataset$default <- as.factor(dataset$default)
dataset$balance <- as.numeric(dataset$balance)
dataset$housing <- as.factor(dataset$housing)
dataset$loan <- as.factor(dataset$loan)
dataset$contact <- as.factor(dataset$contact)
dataset$day <- as.factor(dataset$day)
dataset$month <- as.factor(dataset$month)
dataset$duration <- as.numeric(dataset$duration)
dataset$campaign <- as.numeric(dataset$campaign)
dataset$pdays <- as.numeric(dataset$pdays)
dataset$previous <- as.numeric(dataset$previous)
dataset$poutcome <- as.factor(dataset$poutcome)
dataset$y <- as.factor(dataset$y)
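
# As a side note, the one-by-one conversions above can be written more compactly.
# A minimal sketch (for reference - the explicit conversions have already run):
# convert every character column to a factor in one pass. Note that day arrives
# as a number, so it would still need its explicit as.factor() call.
char_cols <- sapply(dataset, is.character)
dataset[char_cols] <- lapply(dataset[char_cols], as.factor)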

# Exploratory data analysis of factor variables
table(dataset$job)
table(dataset$marital)
table(dataset$education)
table(dataset$default)
table(dataset$housing)
table(dataset$loan)
table(dataset$contact)
table(dataset$day)
table(dataset$month)
table(dataset$poutcome)
table(dataset$y)

# Exploratory data analysis for numeric / continuous variables

# Descriptive statistics for continuous variables

summary(dataset$age)
summary(dataset$balance)
summary(dataset$duration)
summary(dataset$campaign)
summary(dataset$pdays)
summary(dataset$previous)

sd(dataset$age)
sd(dataset$balance)
sd(dataset$duration)
sd(dataset$campaign)
sd(dataset$pdays)
sd(dataset$previous)

## Missing value analysis

# Missing value analysis

table(is.na(dataset$job))
table(is.na(dataset$marital))
table(is.na(dataset$education))
table(is.na(dataset$default))
table(is.na(dataset$housing))
table(is.na(dataset$loan))
table(is.na(dataset$contact))
table(is.na(dataset$day))
table(is.na(dataset$month))
table(is.na(dataset$poutcome))
table(is.na(dataset$y))

table(is.na(dataset$age))
table(is.na(dataset$balance))
table(is.na(dataset$duration))
table(is.na(dataset$campaign))
table(is.na(dataset$pdays))
table(is.na(dataset$previous))
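
# The per-column checks above can be condensed into one call that counts the
# NA values in every column at once:
colSums(is.na(dataset))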

# There are no missing values
# EDA - testing the normality assumption # Rules of thumb
# Skewness - measures how symmetrical the distribution is:
# 1. Between -0.5 and 0.5 - fairly symmetrical
# 2. Between -1 and -0.5, or between 0.5 and 1 - moderately skewed
# 3. Less than -1 or greater than 1 - highly skewed
# (a skewness z-statistic between -1.96 and 1.96 is generally considered acceptable)
# Kurtosis - a normal distribution has kurtosis of 3; values well above 3 indicate
# heavy tails and values well below 3 indicate light tails

install.packages("moments")
library(moments)
skewness(dataset$age)
kurtosis(dataset$age)

# For the other numeric variables, repeat the calls above - or compute them all at once, as sketched below
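
# A compact sketch: apply skewness() and kurtosis() to every numeric column at once
num_cols <- sapply(dataset, is.numeric)
sapply(dataset[num_cols], skewness)
sapply(dataset[num_cols], kurtosis)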

# Data visualization

hist(dataset$age)

# main is the title of the histogram
hist(dataset$age, main = "Histogram for age")

hist(dataset$age, main = "Histogram for age",
     xlab = "age",
     col = "green", breaks = 100)

# xlab is the label of the x-axis
hist(dataset$age, main = "Histogram for age",
     xlab = "age")
# border = color of the borders of the bars
hist(dataset$age, main = "Histogram for age",
     xlab = "age",
     border = "blue")
# col = fill color of the bars
hist(dataset$age, main = "Histogram for age",
     xlab = "age",
     border = "blue",
     col = "green")
## xlim = c(min, max) sets the minimum and maximum values shown on the x-axis
hist(dataset$age, main = "Histogram for age",
     xlab = "age",
     border = "blue",
     col = "green",
     xlim = c(0, 100))
## las - a numeric value indicating the orientation of the tick mark labels
# and any other text added to the plot after its initialization.
# The options are as follows: always parallel to the axis (the default, 0),
# always horizontal (1), always perpendicular to the axis (2),
# and always vertical (3)

# breaks - a single number giving the suggested number of cells for the histogram
## Let us put the options together
hist(dataset$age, main = "Histogram for age",
     xlab = "age",
     border = "blue",
     col = "green",
     xlim = c(0, 100),
     las = 1,
     breaks = 10)

## Multiple colors
hist(dataset$age,
     main = "Histogram for age",
     xlab = "age",
     border = "blue",
     col = c("red", "yellow", "green", "violet", "orange", "blue", "pink", "cyan"),
     xlim = c(0, 100),
     las = 1,
     breaks = 5)

## Boxplot
# A box plot shows the five-number summary - the minimum, the 25th percentile, the median,
# the 75th percentile and the maximum. It is thus useful for visualizing the spread of
# the data and deriving inferences accordingly.

boxplot(dataset$age)
boxplot(dataset$age, col = "Green")
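
# The five numbers the box plot draws can also be printed directly with base R's
# fivenum() (minimum, lower hinge, median, upper hinge, maximum):
fivenum(dataset$age)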

## Bivariate box plot - left of the (~) symbol = continuous variable, right of the (~) symbol = categorical variable

boxplot(dataset$age~dataset$y) # Creating a box plot between two variables

boxplot(dataset$age~dataset$y, col = c("Blue", "Green"))

boxplot(dataset$duration,col="red")
boxplot(dataset$duration~dataset$y,col="red")

## Creating training and validation (test) sets from the given dataset

# Splitting the dataset into the Training set and the Test set

# Partition the data
# The set.seed() function sets the starting number used to generate a sequence of
# random numbers - it ensures that you get the same result each time you run the
# same process starting from that seed
# replace = TRUE - sample with replacement, so the labels 1 and 2 can each be drawn
# repeatedly (one label per row, with probabilities 0.7 and 0.3)
# Note below how rows and columns are selected from the data frame
set.seed(123)
pd <- sample(2,nrow(dataset),replace=TRUE,prob = c(0.7,0.3))
View(pd)
train <- dataset[pd==1,]
test <- dataset[pd==2,]

# dataset[pd == 1, ] - the empty index after the comma means all columns are kept

View(train)
str(train)
View(test)
length(train)  # for a data frame, length() returns the number of columns; use nrow() for the row count

table(train$y)

table(test$y)

# Use the function prop.table() combined with table() to verify that the randomization preserved the class proportions
prop.table(table(train$y))

prop.table(table(test$y))
## the proportions are similar

prop.table(table(dataset$y))
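
# A small convenience sketch: the three proportions side by side, rounded
round(rbind(full = prop.table(table(dataset$y)),
            train = prop.table(table(train$y)),
            test = prop.table(table(test$y))), 3)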

## Logistic regression
# Train the model using the training sets and check score
#Family = binomial = link function is "logit"
logistic_tmc <- glm(formula = y ~ ., data = train, family = "binomial")

summary(logistic_tmc)

View(logistic_tmc)

## Use the summary to find the significant factors
# Significant variables include age, marital, balance, month, pdays, duration, campaign and previous
## Use significant variables to build the model to predict the offer acceptance probability for the test data
logistic_tmc <- glm(formula = y ~ age + balance + month + duration + campaign + contact, data = train, family = "binomial")

View(test)
my_log_prediction <- predict(logistic_tmc, newdata = test, type = "response")

## type - the type of prediction required: one of "link", "response", "terms"
# The default ("link") is on the scale of the linear predictor; the alternative "response" is on the scale of the response variable.
# type = "response" gives the predicted probabilities
# The "terms" option returns a matrix giving the fitted values of each term in the model formula on the linear predictor scale.

View(my_log_prediction)

submit <- data.frame(CustomerID = test$CustomerID, y_actual = test$y, y_prob = my_log_prediction)
View(submit)
View(test)
write.csv(submit, file = "tmc_logistic.csv", row.names = FALSE)

# Now we wish to classify based on probability

# Suppose you wish to classify customers using a 0.5 probability cutoff
# ("1" corresponds to "yes", "0" to "no")
predicted.classes <- ifelse(my_log_prediction > 0.5, "1", "0")

View(predicted.classes)
table(predicted.classes)

submit <- data.frame(CustomerID = test$CustomerID, y_actual = test$y, y_prob = my_log_prediction, y_class = predicted.classes)
View(submit)
str(submit)
write.csv(submit, file = "tmc_logistic1.csv", row.names = FALSE)


# Misclassification error for the test data sample
classification <- table(submit$y_actual,submit$y_class)
classification
accuracy_Test <- sum(diag(classification )) / sum(classification)
accuracy_Test


## Misclassification error
1 - sum(diag(classification)) / sum(classification)
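
# The 0.5 cutoff is a convention, not a requirement. A small sketch of how overall
# accuracy moves with the cutoff (with an imbalanced response like this one,
# accuracy alone can be misleading):
cutoffs <- seq(0.1, 0.9, by = 0.1)
sapply(cutoffs, function(ct) mean(ifelse(my_log_prediction > ct, "yes", "no") == test$y))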




## Decision trees

install.packages('rattle')
install.packages('rpart.plot')
install.packages('RColorBrewer')
install.packages("rpart")

library(rpart)
library(rattle)
library(rpart.plot)




fit <- rpart(y ~ ., data = train, method="class")
View(fit)
# examine the tree
plot(fit)
text(fit)

# better visualization of the tree
fancyRpartPlot(fit)
summary(fit)
fit$variable.importance
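
# rpart.plot() from the already-loaded rpart.plot package is another way to draw
# the tree; the type and extra values below are illustrative display options
rpart.plot(fit, type = 2, extra = 104)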

# A simpler tree using only the most important variables
fit1 <- rpart(y ~ duration + poutcome + day + month + age + pdays + job, data = train, method = "class")

# rpart splits using the Gini index by default (information gain is the alternative); choose variables based on variable importance

Prediction <- predict(fit, test, type = "class")
submit <- data.frame(CustomerID = test$CustomerID, y_actual = test$y, y_class = Prediction)
write.csv(submit, file = "myfirsttmctree.csv", row.names = FALSE)


# Misclassification error for the test data sample
# Accuracy test
classification <- table(submit$y_actual, submit$y_class)
classification
accuracy_Test <- sum(diag(classification )) / sum(classification)
accuracy_Test
##
## Misclassification error
1 - sum(diag(classification)) / sum(classification)
# After running this we can inspect the confusion matrix and decide whether the model
# needs to be updated or to include more variables
# Here we conclude that the model does need improvement, as the error rate for the
# minority ("yes") class is very high

## Tuning the parameters

## rpart.control(minsplit = 400, minbucket = round(minsplit/3), maxdepth = 30)
## Arguments:
# - minsplit: the minimum number of observations that must exist in a node before the algorithm attempts a split
# - minbucket: the minimum number of observations in any terminal node, i.e. the leaf
# - maxdepth: the maximum depth of any node of the final tree, with the root node counted as depth 0
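
# A minimal sketch of passing these controls to rpart; the values below are
# illustrative, not tuned recommendations
tree_control <- rpart.control(minsplit = 400, minbucket = round(400 / 3), maxdepth = 10)
fit_tuned <- rpart(y ~ ., data = train, method = "class", control = tree_control)
table(test$y, predict(fit_tuned, test, type = "class"))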


## Random forest
# Random forest is an ensemble of decision trees
# Each decision tree casts a vote for the prediction of the target variable;
# random forest chooses the prediction that gets the most votes
install.packages("randomForest")
library("randomForest")

rfm_model <- randomForest(formula = y ~., data = train)

print(rfm_model)
# By default the number of trees is 500 and the number of variables tried at each split
# is floor(sqrt(p)) for classification, which is 4 for this dataset

# Importance of each predictor:
# variables with high importance are drivers of the outcome;
# variables with low importance might be omitted from the model, making it simpler.
# This importance measures how much removing a variable decreases accuracy and,
# conversely, how much including it increases accuracy.
rfm_model$importance

# From the importance output we conclude that age, job, balance, day, duration, pdays,
# poutcome and month are important predictors of cross-sell acceptance
rfm_model2 <- randomForest(formula = y ~ age + job + balance + day + duration + pdays + poutcome + month, data = train, ntree = 500, mtry = 4, importance = TRUE)
print(rfm_model2)
# Fine-tuning parameters of the Random Forest model:
# mtry = number of variables tried at each split
# ntree = number of decision trees
# You fine-tune these parameters to improve accuracy and decrease the prediction error rate
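
# A sketch of searching for a good mtry with tuneRF() from the randomForest package;
# the ntreeTry, stepFactor and improve values below are illustrative
set.seed(123)
tuneRF(x = train[, setdiff(names(train), "y")], y = train$y,
       ntreeTry = 200, stepFactor = 1.5, improve = 0.01)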
# Extract the variable importance measure
importance(rfm_model2)
varImpPlot(rfm_model2)


# There is no improvement in accuracy with this data; however, tuning these parameters generally improves accuracy

# Using this model to predict on the test data

# Predicting on the test set
prediction_RFM <- predict(rfm_model2, newdata = test, type = "class")
View(prediction_RFM)

submit <- data.frame(CustomerID = test$CustomerID, y_actual = test$y, y_class = prediction_RFM)
write.csv(submit, file = "tmc_rfm.csv", row.names = FALSE)
# Checking classification accuracy
table(submit$y_actual, submit$y_class)
classification <- table(submit$y_actual, submit$y_class)
classification
accuracy_Test <- sum(diag(classification )) / sum(classification)
accuracy_Test
View(submit)
## Misclassification error
1 - sum(diag(classification)) / sum(classification)


## Support vector machines - e1071 package contains the svm function
install.packages('e1071')
library(e1071)


classifier <- svm(formula = y ~ .,
                  data = train,
                  type = 'C-classification',
                  kernel = 'linear')
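
# A sketch of swapping in a radial (RBF) kernel; e1071 also supports 'polynomial'
# and 'sigmoid'. The cost and gamma values below are illustrative, not tuned.
classifier_rbf <- svm(formula = y ~ .,
                      data = train,
                      type = 'C-classification',
                      kernel = 'radial',
                      cost = 1, gamma = 0.1)
summary(classifier_rbf)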


classifier
summary(classifier)
View(classifier)

# Using the model to predict for the test data

prediction_svm <- predict(classifier, newdata = test)

prediction_svm

submit <- data.frame(CustomerID = test$CustomerID, y_actual = test$y, y_class = prediction_svm)
write.csv(submit, file = "tmc_svm.csv", row.names = FALSE)

# Misclassification error for the test data sample

table(submit$y_actual, submit$y_class)
classification <- table(submit$y_actual, submit$y_class)
classification
accuracy_Test <- sum(diag(classification )) / sum(classification)
accuracy_Test

## Misclassification error
1 - sum(diag(classification)) / sum(classification)
