# Unsupervised machine learning: creating customer segments from RFM data with k-means clustering in R

# Work on a fresh copy of the imported data so the original object stays
# intact and the analysis can easily be restarted from it.

View(Deleted_negative_price_quantity)

dataset <- Deleted_negative_price_quantity
View(dataset)

str(dataset)
# 541909 obs. of 8 variables
# Invoice date - Min.: 2010-12-01 to Max.: 2011-12-09
# InvoiceDate: POSIXct - stores both a date and time with an associated time zone.
# The default time zone selected is the time zone that your computer is set to,
# which is most often your local time zone.
# Details available at https://www.stat.berkeley.edu/~s133/dates.html
# We will later convert it with as.Date() to a plain calendar date.

# Overall summary of every column (run once; the checks below look at the
# individual columns the notes refer to).
summary(dataset)

# Quantity has negative values # We need to drop these
summary(dataset$Quantity)

# Unit price has negative values # We need to drop these
summary(dataset$UnitPrice)

# CustomerID has missing values # We need to drop these
table(is.na(dataset$CustomerID))

# We will use dplyr package for data cleaning and manipulation

# Before we understand some functions in dplyr package, let us understand pipe operator
# %>% pipe operator part of dplyr package and magrittr package
# pipe operator implies chaining # You avoid storing results in intermediate variables
# it takes the output of one statement and makes it the input of the next statement.

# Important functions in dplyr package
#mutate() adds new variables that are functions of existing variables (transform) and preserves existing ones
#select() picks variables based on their names.
#filter() picks cases based on their values.
#summarise() reduces multiple values down to a single summary.
#arrange() changes the ordering of the rows.
#group_by() groups the rows by one or more variables so that the verbs that follow operate per group.

## Drop rows containing missing values
library(dplyr)
library(tidyr)
# Keep only the rows that have no missing value in any column.
dataset <- drop_na(dataset)

# Inspect the result of the clean-up.
str(dataset)
View(dataset)
summary(dataset)
# 168631 obs. of 8 variables
# Recode variables
# character variables to factors

# Recode the identifier columns as factors and reduce InvoiceDate to a Date.
#
# The second positional argument of as.Date() depends on the class of its
# input: for date-time (POSIXct/POSIXlt) input it is `tz`, for character input
# it is `format`. Passing the format string positionally therefore hands it to
# `tz` when InvoiceDate is already a date-time. Name the argument and only use
# it when the column still has to be parsed from text.
dataset_recode <- dataset %>%
  mutate(
    InvoiceNo = as.factor(InvoiceNo),
    StockCode = as.factor(StockCode),
    InvoiceDate = if (inherits(InvoiceDate, "POSIXt")) {
      # Already a date-time: just drop the time of day (UTC, the default).
      as.Date(InvoiceDate)
    } else {
      as.Date(InvoiceDate, format = "%m/%d/%Y %H:%M")
    },
    CustomerID = as.factor(CustomerID),
    Country = as.factor(Country)
  )

View(dataset_recode)

summary(dataset_recode$Quantity)
summary(dataset_recode$UnitPrice)

str(dataset_recode)

## Monetary value of every invoice line

# total_dolar is Quantity multiplied by UnitPrice, added as a new column.
dataset_recode <- mutate(dataset_recode, total_dolar = Quantity * UnitPrice)

View(dataset_recode)
# New variable created as total_dolar
# observe new variable added
#Calculate RFM
#To implement the RFM analysis, 
#a. Recency - Find the most recent transaction date for each customer ID and calculate the difference between a reference date and that most recent transaction date (RECENCY)
#b. Frequency - Group the data by customer ID and then count the number of distinct invoices (InvoiceNo) to give the FREQUENCY of transactions
#c. Monetary Value - The product (Quantity * UnitPrice) is the monetary value of each invoice line. Sum of these monetary values / frequency of transactions = average monetary value per transaction for each unique customer

# group_by takes an existing table and converts it into a grouped table where operations are performed "by group"
##

# Collapse the invoice lines to one row per customer:
#   recency   - days between the reference date 2012-01-01 and the customer's
#               latest InvoiceDate
#   frequenci - number of distinct invoices
#   monitery  - total spend divided by the number of distinct invoices
processed_data <- summarise(
  group_by(dataset_recode, CustomerID),
  recency = as.numeric(as.Date("2012-01-01") - max(InvoiceDate)),
  frequenci = n_distinct(InvoiceNo),
  monitery = sum(total_dolar) / n_distinct(InvoiceNo)
)

View(dataset)
summary(processed_data)

# Keep a copy of the per-customer table on disk.
write.csv(processed_data, file = "processed_data.csv", row.names = FALSE)

View(processed_data)

# Recency - How recently did the customer purchase?
summary(processed_data$recency)
boxplot(processed_data$recency, col = "blue")
hist(processed_data$recency)

# Frequency - How often do they purchase?
summary(processed_data$frequenci)
boxplot(processed_data$frequenci, col = "blue")

hist(processed_data$frequenci, breaks = 50)

# Monetary Value - How much do they spend?
hist(processed_data$monitery, breaks = 50)

summary(processed_data$monitery)
boxplot(processed_data$monitery, col = "blue")

# We use a log scale to reduce the skew of the monetary value.
# log() is only finite for strictly positive input: a zero or negative average
# would become -Inf / NaN and make scale(), kmeans() and hclust() fail later
# with an unhelpful error, so stop here with a clear message instead.
# NOTE: this overwrites the column in place, so run it only once per session.
if (any(!is.finite(processed_data$monitery) | processed_data$monitery <= 0)) {
  stop(
    "monitery contains missing, zero or negative values; ",
    "remove those customers before taking the log.",
    call. = FALSE
  )
}
processed_data$monitery <- log(processed_data$monitery)
hist(processed_data$monitery)

View(processed_data)
# K means clustering

# Build the numeric input for clustering.
# summarise() returns a tibble, and setting row names on a tibble is
# deprecated (it raises a warning). Convert to a plain data frame first so the
# customer IDs can be kept as row names without relying on deprecated
# behaviour.
processed_data2 <- as.data.frame(processed_data)

View(processed_data2)
row.names(processed_data2) <- as.character(processed_data2$CustomerID)

View(processed_data2)

# The ID now lives in the row names; drop the column so that only the
# continuous variables remain.
processed_data2$CustomerID <- NULL

## running cluster analysis ...only continuous variables

# scale() is a generic function whose default method centers and/or scales the
# columns of a numeric matrix. The result is a matrix.
processed_data2 <- scale(processed_data2)

View(processed_data2)
summary(processed_data)

summary(processed_data2)

# The data given by x are clustered by the k-means method, which aims to partition the points 
# into k groups such that the sum of squares from points to the assigned cluster centres is minimized.
# The algorithm of Hartigan and Wong (1979) is used by default.

## kmeans(x, centers, iter.max = 10, nstart = 1,
#         algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"),
#         trace = FALSE)
# x - numeric matrix/dataframe /vector of data 
# Hartigan algorithm - which defines the total within-cluster variation as the sum of 
# squared distances Euclidean distances between items and the corresponding centroid:
# centers - a set of initial (distinct) cluster centres
# iter.max - the maximum number of iterations allowed.
# algorithm - character: may be abbreviated. Note that "Lloyd" and "Forgy" are alternative names for one algorithm.
#nstart - we specify nstart = 25. 
#This means that R will try 25 different random starting assignments 
# and then select the best results corresponding to the one with the lowest within cluster variation. 
# The default value of nstart in R is one. 
#But, it’s strongly recommended to compute k-means clustering with a large value of nstart such as 25 or 50, in order to have a more stable result.


## implementation

# kmeans() chooses its starting centres at random. Fix the seed so that the
# cluster numbering and sizes are the same every time the script is run.
set.seed(123)
Kclust <- kmeans(processed_data2, centers = 6, iter.max = 10, nstart = 25)

# Cluster assignment of every customer, cluster sizes and cluster centres
# (the centres are on the scaled variables).
View(Kclust$cluster)
table(Kclust$cluster)
View(Kclust$centers)


## Analyzing output

## cluster: A vector of integers (from 1:k) indicating the cluster to which each point is allocated.
## centers: A matrix of cluster centers.
## totss: The total sum of squares.
## withinss: Vector of within-cluster sum of squares, one component per cluster.
### tot.withinss: Total within-cluster sum of squares, i.e. sum(withinss).
## betweenss: The between-cluster sum of squares, i.e. $totss-tot.withinss$.
## size: The number of points in each cluster.

## Let us calculate the mean of each cluster using the unscaled data

# Only the numeric RFM columns are averaged: CustomerID is a factor, and
# mean() on a factor returns NA with a warning for every cluster.
# Note that monitery was replaced by its logarithm earlier in the script, so
# its cluster means are on the log scale.
aggregate(
  processed_data[, c("recency", "frequenci", "monitery")],
  by = list(cluster = Kclust$cluster),
  FUN = mean
)

## Let us add the segment number to the data
Clustering_output <- cbind(processed_data, cluster = Kclust$cluster)
View(Clustering_output)

## Visualizing the cluster

# Install factoextra only when it is missing, so that re-running the script
# does not download and reinstall the package every time.
if (!requireNamespace("factoextra", quietly = TRUE)) {
  install.packages("factoextra")
}
library(factoextra)


fviz_cluster(Kclust, data = processed_data2)

## Choosing the number of clusters

# Elbow method (method = "wss"): plots the total within-cluster sum of squares
# for each candidate number of clusters.
set.seed(123)
fviz_nbclust(x = processed_data2, FUNcluster = kmeans, method = "wss")


# Average silhouette method (method = "silhouette"): measures the quality of a
# clustering, i.e. how well each object lies within its cluster. A high
# average silhouette width indicates a good clustering.
fviz_nbclust(x = processed_data2, FUNcluster = kmeans, method = "silhouette")

# method = gap statistic
# Install the cluster package only when it is missing instead of reinstalling
# it on every run.
if (!requireNamespace("cluster", quietly = TRUE)) {
  install.packages("cluster")
}
library(cluster)

# clusGap() draws B = 50 reference data sets at random; fix the seed so the
# gap curve is reproducible.
set.seed(123)
gap_stat <- clusGap(
  processed_data2,
  FUN = kmeans,
  nstart = 25,
  K.max = 10,
  B = 50
)
library(factoextra)
fviz_gap_stat(gap_stat)


### hierarchical Clustering

library(dendextend)
library(knitr)

View(processed_data2)

# Rebuild the scaled input from the per-customer table.
# Setting row names on a tibble is deprecated (it raises a warning), so convert
# to a plain data frame before storing the customer IDs as row names.
processed_data2 <- as.data.frame(processed_data)
row.names(processed_data2) <- as.character(processed_data2$CustomerID)

View(processed_data2)

# The ID now lives in the row names; keep only the continuous variables.
processed_data2$CustomerID <- NULL

View(processed_data2)

## running cluster analysis ...only continuous variables

# scale() is a generic function whose default method centers and/or scales the
# columns of a numeric matrix.
processed_data2 <- scale(processed_data2)
summary(processed_data)

summary(processed_data2)

## hclust_methods <- c("ward.D", "single", "complete", "average", "mcquitty",
##                     "median", "centroid", "ward.D2")

## dist() function will calculate the distance between each customers. 
# This takes dataframe as an input 
# dist() only computes the following distance measures: 
# "euclidean", "maximum", "manhattan", "canberra", "binary" or "minkowski"

# Pairwise distances between customers (dist() defaults to Euclidean).
d <- dist(processed_data2)

# Printing `d` itself would dump every pairwise distance to the console
# (n * (n - 1) / 2 values); look at a summary of the distances instead.
summary(as.vector(d))

# Ward's minimum-variance linkage. The fit is stored as `hc_fit` rather than
# `c`, so the base function c() is not masked by a variable of the same name.
hc_fit <- hclust(d, method = "ward.D2")

plot(hc_fit)

# Cut the dendrogram into 8 groups.
members <- cutree(hc_fit, k = 8)

# let us look at the first five cluster assignments

members[1:5]
members
table(members)
# Cluster characteristics

aggregate(processed_data[, 2:4], by = list(members), mean)
# Unsupervised machine learning Creating Segments using RFM data Kmeans clustering in R

# Leave a Reply Cancel reply

# Subscribe