Predicting Telecom Customer Churn: Machine Learning in R

setwd("")

library(readr)

WA_Fn_UseC_Telco_Customer_Churn <- read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

View(WA_Fn_UseC_Telco_Customer_Churn)

dataset <- WA_Fn_UseC_Telco_Customer_Churn

str(dataset)
head(dataset)
tail(dataset)
View(dataset)

# Converting to factors

dataset$gender <- as.factor(dataset$gender)
dataset$SeniorCitizen <- as.factor(dataset$SeniorCitizen)
dataset$Partner <- as.factor(dataset$Partner)
dataset$Dependents <- as.factor(dataset$Dependents)
dataset$PhoneService <- as.factor(dataset$PhoneService)
dataset$MultipleLines <- as.factor(dataset$MultipleLines)
dataset$InternetService <- as.factor(dataset$InternetService)
dataset$OnlineSecurity <- as.factor(dataset$OnlineSecurity)
dataset$OnlineBackup <- as.factor(dataset$OnlineBackup)
dataset$DeviceProtection <- as.factor(dataset$DeviceProtection)
dataset$TechSupport <- as.factor(dataset$TechSupport)
dataset$StreamingTV <- as.factor(dataset$StreamingTV)
dataset$StreamingMovies <- as.factor(dataset$StreamingMovies)
dataset$Contract <- as.factor(dataset$Contract)
dataset$PaperlessBilling <- as.factor(dataset$PaperlessBilling)
dataset$PaymentMethod <- as.factor(dataset$PaymentMethod)
dataset$Churn <- as.factor(dataset$Churn)
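
# A more compact equivalent of the block above (a sketch, assuming the standard column
# names of the IBM Telco churn CSV): convert all categorical columns in one pass
cat_cols <- c("gender", "SeniorCitizen", "Partner", "Dependents", "PhoneService",
              "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
              "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
              "Contract", "PaperlessBilling", "PaymentMethod", "Churn")
dataset[cat_cols] <- lapply(dataset[cat_cols], as.factor)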

# The numeric columns (tenure, MonthlyCharges, TotalCharges) are already read in correctly, so no conversion is required

# EDA - categorical data: univariate analysis
table(dataset$gender)
table(dataset$SeniorCitizen)
table(dataset$Partner)
table(dataset$Dependents)
table(dataset$PhoneService)
table(dataset$MultipleLines)
table(dataset$InternetService)
table(dataset$OnlineSecurity)
table(dataset$OnlineBackup)
table(dataset$DeviceProtection)
table(dataset$TechSupport)
table(dataset$StreamingTV)
table(dataset$StreamingMovies)
table(dataset$Contract)
table(dataset$PaperlessBilling)
table(dataset$PaymentMethod)
table(dataset$Churn)

# EDA - continuous data: univariate analysis
mean(dataset$tenure)
mean(dataset$MonthlyCharges)
mean(dataset$TotalCharges)
# TotalCharges has missing values; na.rm = TRUE ignores them when calculating the mean
mean(dataset$TotalCharges, na.rm = TRUE)

sd(dataset$tenure)
sd(dataset$MonthlyCharges)
sd(dataset$TotalCharges, na.rm = TRUE)

summary(dataset$tenure)
summary(dataset$MonthlyCharges)
summary(dataset$TotalCharges)

# Missing value analysis

table(is.na(dataset$gender))
table(is.na(dataset$SeniorCitizen))
table(is.na(dataset$Partner))
table(is.na(dataset$Dependents))
table(is.na(dataset$PhoneService))
table(is.na(dataset$MultipleLines))
table(is.na(dataset$InternetService))
table(is.na(dataset$OnlineSecurity))
table(is.na(dataset$OnlineBackup))
table(is.na(dataset$DeviceProtection))
table(is.na(dataset$TechSupport))
table(is.na(dataset$StreamingTV))
table(is.na(dataset$StreamingMovies))
table(is.na(dataset$Contract))
table(is.na(dataset$PaperlessBilling))
table(is.na(dataset$PaymentMethod))
table(is.na(dataset$Churn))
table(is.na(dataset$tenure))
table(is.na(dataset$MonthlyCharges))
table(is.na(dataset$TotalCharges))
# Missing values in total charges
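
# A compact alternative to the per-column checks above (a sketch): count the NAs in
# every column at once
colSums(is.na(dataset))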

# What to do with the missing values?
# Mean imputation
View(dataset)

mean(dataset$TotalCharges, na.rm = TRUE)
dataset$TotalCharges[is.na(dataset$TotalCharges)] <- mean(dataset$TotalCharges, na.rm = TRUE)





mean(dataset$TotalCharges)

# Computing skewness, kurtosis, boxplots, etc.
# How to check the normality assumption
install.packages("e1071")
library(e1071)
# Skewness between -0.5 and 0.5: only slightly skewed; between -1 and -0.5 or 0.5 and 1: moderately skewed
# Values between -1.96 and +1.96 are generally considered acceptable

skewness(dataset$tenure)
kurtosis(dataset$tenure)
skewness(dataset$MonthlyCharges)
kurtosis(dataset$MonthlyCharges)
skewness(dataset$TotalCharges)
kurtosis(dataset$TotalCharges)
# All values within acceptable limits
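
# Additional normality checks (a sketch): a Q-Q plot, and a Shapiro-Wilk test on a
# random sample (shapiro.test() accepts at most 5000 values, and this dataset has ~7000 rows)
qqnorm(dataset$MonthlyCharges)
qqline(dataset$MonthlyCharges)
set.seed(123)
shapiro.test(sample(dataset$MonthlyCharges, 5000))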

# Plotting histograms

hist(dataset$TotalCharges)
hist(dataset$MonthlyCharges)
hist(dataset$tenure)

# Boxplot

boxplot(dataset$TotalCharges)
boxplot(dataset$MonthlyCharges)
boxplot(dataset$tenure)
View(dataset)


# converting "yes" "no" response values to 1, 0 value
# Explain with and without quotes "1" and 1

dataset$Churn <- ifelse(dataset$Churn == "Yes", "1", "0")
View(dataset)
levels(dataset$Churn)  # NULL here: ifelse() returned a character vector; levels exist again after as.factor() below

dataset$Churn <- as.factor(dataset$Churn)
str(dataset)

table(dataset$Churn)
# Creating training and validation (test) sets from the given dataset
# Splitting the dataset into the train set and test set
# Partition the data
# The set.seed() function sets the starting number used to generate a sequence of
# random numbers; it ensures that you get the same result each time you run the
# same process with that seed
# replace = TRUE samples with replacement, so each row is independently assigned to group 1 or group 2
set.seed(123)
pd <- sample(2, nrow(dataset), replace = TRUE, prob = c(0.7, 0.3))
View(pd)
train <- dataset[pd==1,]
test <- dataset[pd==2,]

# The empty column index after the comma means all columns are kept

View(train)
str(train)
View(test)
str(test)

table(train$Churn)

table(test$Churn)

# Use prop.table() combined with table() to verify that the randomization preserved the class proportions
prop.table(table(train$Churn))

prop.table(table(test$Churn))
## The proportions are similar

prop.table(table(dataset$Churn))
View(train)
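
# If the train/test churn proportions had drifted apart, a stratified split would keep
# them aligned. A minimal sketch using caret::createDataPartition (caret is not used
# elsewhere in this script and would need to be installed first):
# install.packages("caret")
library(caret)
set.seed(123)
idx <- createDataPartition(dataset$Churn, p = 0.7, list = FALSE)
train_strat <- dataset[as.vector(idx), ]
test_strat  <- dataset[-as.vector(idx), ]
prop.table(table(train_strat$Churn))
prop.table(table(test_strat$Churn))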
## Grouped bar plots (beside = TRUE draws the bars side by side)
#Gender
table(train$gender, train$Churn)
counts <- table(train$gender, train$Churn)
barplot(counts, main="Churn with respect to Gender",
xlab="Churn", col=c("darkblue","red"),
legend = rownames(counts), beside = TRUE)

# Two-way cross tabulation
install.packages("gmodels")
library(gmodels)

CrossTable(train$gender, train$Churn)
chisq.test(train$gender, train$Churn)
# p-value is about 0.24, which is not less than 0.05, hence no significant relationship between gender and churn
# The bar plot also suggests the same thing

# Senior citizen

counts <- table(train$SeniorCitizen, train$Churn)
barplot(counts, main = "Churn with respect to Senior Citizen",
        xlab = "Churn", col = c("darkblue", "red"),
        legend = rownames(counts), beside = TRUE)

CrossTable(train$SeniorCitizen, train$Churn)
chisq.test(train$SeniorCitizen, train$Churn)

## Other cross tabulations
CrossTable(train$Partner, train$Churn)
CrossTable(train$Dependents, train$Churn)
CrossTable(train$PhoneService, train$Churn)
CrossTable(train$MultipleLines, train$Churn)
CrossTable(train$InternetService, train$Churn)
CrossTable(train$OnlineSecurity, train$Churn)
CrossTable(train$OnlineBackup, train$Churn)
CrossTable(train$DeviceProtection, train$Churn)
CrossTable(train$TechSupport, train$Churn)
CrossTable(train$StreamingTV, train$Churn)
CrossTable(train$StreamingMovies, train$Churn)
CrossTable(train$Contract, train$Churn)
CrossTable(train$PaperlessBilling, train$Churn)
CrossTable(train$PaymentMethod, train$Churn)

# Chi-square tests

chisq.test(train$Partner, train$Churn) # Significant
chisq.test(train$Dependents, train$Churn) # Significant
chisq.test(train$PhoneService, train$Churn) # Insignificant
chisq.test(train$MultipleLines, train$Churn) # Significant
chisq.test(train$InternetService, train$Churn) # Significant
chisq.test(train$OnlineSecurity, train$Churn) # Significant
chisq.test(train$OnlineBackup, train$Churn) # Significant
chisq.test(train$DeviceProtection, train$Churn) # Significant
chisq.test(train$TechSupport, train$Churn) # Significant
chisq.test(train$StreamingTV, train$Churn) # Significant
chisq.test(train$StreamingMovies, train$Churn) # Significant
chisq.test(train$Contract, train$Churn) # Significant
chisq.test(train$PaperlessBilling, train$Churn) # Significant
chisq.test(train$PaymentMethod, train$Churn) # Significant
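
# The same tests can be run in one pass (a sketch): collect the chi-square p-values
# for all categorical predictors against Churn
cat_predictors <- c("gender", "SeniorCitizen", "Partner", "Dependents", "PhoneService",
                    "MultipleLines", "InternetService", "OnlineSecurity", "OnlineBackup",
                    "DeviceProtection", "TechSupport", "StreamingTV", "StreamingMovies",
                    "Contract", "PaperlessBilling", "PaymentMethod")
sapply(cat_predictors, function(v) chisq.test(train[[v]], train$Churn)$p.value)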

# Significance of continuous variables with churn (simple logistic regressions)

Tenure <- glm(formula = Churn~ tenure, data = train, family = "binomial")
summary(Tenure)

MonthlyCharges <- glm(formula = Churn~ MonthlyCharges, data = train, family = "binomial")
summary(MonthlyCharges) # Significant
TotalCharges <- glm(formula = Churn~ TotalCharges, data = train, family = "binomial")
summary(TotalCharges) # Significant

# Machine learning model: logistic regression with multiple predictors


Logistic_churn <- glm(formula = Churn~
Partner + Dependents + MultipleLines + SeniorCitizen +
InternetService + OnlineSecurity + OnlineBackup +
DeviceProtection + TechSupport + StreamingTV +
StreamingMovies + Contract + PaperlessBilling +
PaymentMethod + tenure + TotalCharges + MonthlyCharges,
data = train, family = "binomial")

summary(Logistic_churn)
# Significant variables: Dependents, MultipleLines, InternetService, StreamingTV, Contract, PaperlessBilling, PaymentMethod, tenure, TotalCharges
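
# Instead of dropping the non-significant terms by hand (as done for Logistic_churn2
# below), an automated alternative is AIC-based backward selection with base R's step();
# a sketch:
Logistic_churn_step <- step(Logistic_churn, direction = "backward", trace = FALSE)
summary(Logistic_churn_step)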

Logistic_churn2 <- glm(formula = Churn~
Dependents + MultipleLines +
InternetService + StreamingTV +
StreamingMovies + Contract + PaperlessBilling +
PaymentMethod + tenure + TotalCharges + MonthlyCharges,
data = train, family = "binomial")

View(train)
View(test)
my_log_prediction <- predict(Logistic_churn2, newdata = test, type = "response")
View(my_log_prediction)
## Type of prediction required: c("link", "response", "terms")
# The default ("link") is on the scale of the linear predictors; "response" is on the scale of the response variable.
# type = "response" gives the predicted probabilities.
# "terms" returns a matrix with the fitted values of each term in the model formula, on the linear predictor scale.
Churn_predicted <- ifelse(my_log_prediction > 0.5, "1", "0")
View(Churn_predicted)
table(Churn_predicted)

submit <- data.frame(CustomerID = test$customerID, Churn_actual = test$Churn, Churn_predicted = Churn_predicted)
View(submit)
write.csv(submit, file = "Logistic_churn.csv", row.names = FALSE)

# Evaluation
# Misclassification error for the test sample
classification <- table(submit$Churn_actual, Churn_predicted)
classification
accuracy_Test <- sum(diag(classification )) / sum(classification)
accuracy_Test
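
# Accuracy alone can be misleading on an imbalanced outcome such as churn. A sketch of
# additional metrics, assuming the caret and pROC packages are installed (neither is
# used elsewhere in this script):
# install.packages(c("caret", "pROC"))
library(caret)
library(pROC)
confusionMatrix(factor(Churn_predicted, levels = levels(submit$Churn_actual)),
                submit$Churn_actual, positive = "1")   # sensitivity, specificity, kappa
roc_logistic <- roc(response = submit$Churn_actual, predictor = my_log_prediction)
auc(roc_logistic)                                      # area under the ROC curve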

install.packages("expss")
library(expss)
classification <- cro(submit$Churn_actual, Churn_predicted)

##Missclassfication
1-sum(diag(classification)/sum(classification))

# Decision trees

library(rpart)
library(rattle)
library(rpart.plot)


Churn_tree <- rpart(Churn ~ Dependents + MultipleLines +
InternetService + StreamingTV +
StreamingMovies + Contract + PaperlessBilling +
PaymentMethod + tenure + TotalCharges + MonthlyCharges,
data = train, method="class")
View(Churn_tree)
# Examine the tree
plot(Churn_tree)
text(Churn_tree)

# Better visualization of the tree
fancyRpartPlot(Churn_tree)
summary(Churn_tree)
Churn_tree$variable.importance
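
# The complexity-parameter table shows how the cross-validated error changes with tree
# size; a pruning sketch (selecting the cp with the lowest xerror):
printcp(Churn_tree)
plotcp(Churn_tree)
best_cp <- Churn_tree$cptable[which.min(Churn_tree$cptable[, "xerror"]), "CP"]
Churn_tree_pruned <- prune(Churn_tree, cp = best_cp)
fancyRpartPlot(Churn_tree_pruned)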

Prediction_tree <- predict(Churn_tree, test, type = "class")

submit <- data.frame(CustomerID = test$customerID, Churn_actual = test$Churn, Churn_predicted = Prediction_tree)
View(submit)
write.csv(submit, file = "Churn_tree.csv", row.names = FALSE)

# Evaluation
# Misclassification error for the test sample (use the tree predictions stored in submit)
classification <- table(submit$Churn_actual, submit$Churn_predicted)
classification
accuracy_Test <- sum(diag(classification )) / sum(classification)
accuracy_Test

## Random forest
# Random forest is an ensemble of decision trees
# Each decision tree casts a vote for the predicted class of the target variable,
# and the random forest chooses the prediction that gets the most votes
install.packages("randomForest")
library("randomForest")

Churn_rfm <- randomForest(formula = Churn ~ Dependents + MultipleLines +
InternetService + StreamingTV +
StreamingMovies + Contract + PaperlessBilling +
PaymentMethod + tenure + TotalCharges + MonthlyCharges,
data = train)

print(Churn_rfm)
# By default ntree = 500, and mtry (the number of variables tried at each split) defaults to floor(sqrt(p)) for classification; see the printed output above

# Importance of each predictor
# Variables with high importance are drivers of the outcome;
# variables with low importance might be omitted from a model, making it simpler.
# This importance measures how much removing a variable decreases accuracy,
# and, conversely, how much including it increases accuracy.
Churn_rfm$importance
# Would you like to build another random forest?
# Fine-tuning parameters of the random forest model:
# mtry = number of variables tried at each split
# ntree = number of decision trees; you fine-tune these parameters to improve accuracy and
# decrease the prediction error rate
# Extract the variable importance measure

# There is no improvement in accuracy with this data, but in general tuning can improve it (a tuning sketch follows)
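
# A minimal tuning sketch: refit with an explicit ntree and a larger mtry and compare
# the out-of-bag error rates (ntree = 1000 and mtry = 4 are illustrative values, not
# recommendations)
set.seed(123)
Churn_rfm_tuned <- randomForest(formula = Churn ~ Dependents + MultipleLines +
                                  InternetService + StreamingTV +
                                  StreamingMovies + Contract + PaperlessBilling +
                                  PaymentMethod + tenure + TotalCharges + MonthlyCharges,
                                data = train, ntree = 1000, mtry = 4)
print(Churn_rfm_tuned)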

# Using this model to predict on the test data

# Predicting on the test set
prediction_RFM <- predict(Churn_rfm, newdata = test, type = "class")
View(prediction_RFM)

submit <- data.frame(CustomerID = test$customerID, Churn_actual = test$Churn, Churn_predicted = prediction_RFM)
View(submit)
write.csv(submit, file = "Churn_tree.csv", row.names = FALSE)

# Evaluation
# Misclassification error for the test sample (use the random forest predictions stored in submit)
classification <- table(submit$Churn_actual, submit$Churn_predicted)
classification
accuracy_Test <- sum(diag(classification )) / sum(classification)
accuracy_Test

## Support vector machines - e1071 package contains the svm function
# install.packages("e1071")  # e1071 was already installed and loaded above for skewness/kurtosis
library(e1071)


Churn_svm = svm(formula = Churn ~ Dependents + MultipleLines +
InternetService + StreamingTV +
StreamingMovies + Contract + PaperlessBilling +
PaymentMethod + tenure + TotalCharges + MonthlyCharges,
data = train,
type = 'C-classification',
kernel = 'linear')


Churn_svm
summary(Churn_svm)
View(Churn_svm)

# Using the model to predict for the test data

prediction_svm = predict(Churn_svm, newdata = test)

prediction_svm
submit <- data.frame(CustomerID = test$customerID, Churn_actual = test$Churn, Churn_predicted = prediction_svm)
View(submit)
write.csv(submit, file = "Churn_tree.csv", row.names = FALSE)

# Evaluation
# Misclassification error for the test sample (use the SVM predictions stored in submit)
classification <- table(submit$Churn_actual, submit$Churn_predicted)
classification
accuracy_Test <- sum(diag(classification )) / sum(classification)
accuracy_Test
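
# The linear kernel with the default cost is only a starting point. A hedged tuning
# sketch with e1071::tune.svm over cost and gamma using a radial kernel (cross-validation
# on ~5000 training rows can take a while; the grid values are illustrative):
set.seed(123)
svm_tuned <- tune.svm(Churn ~ Dependents + MultipleLines +
                        InternetService + StreamingTV +
                        StreamingMovies + Contract + PaperlessBilling +
                        PaymentMethod + tenure + TotalCharges + MonthlyCharges,
                      data = train, kernel = "radial",
                      cost = c(0.1, 1, 10), gamma = c(0.01, 0.1))
summary(svm_tuned)
svm_tuned$best.model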
