## Unsupervised machine learning: creating segments using RFM data with k-means clustering in R

# Work on a fresh copy of the data so that the imported original object
# stays intact and can be reverted to easily.
# (Deleted_negative_price_quantity is presumably the retail transactions
# import with negative price/quantity rows already removed -- confirm
# against the import step, which is not shown in this file.)

View(Deleted_negative_price_quantity)

dataset <- Deleted_negative_price_quantity

View(dataset)

str(dataset)

# 541909 obs. of 8 variables

# InvoiceDate - Min.: 2010-12-01 to Max.: 2011-12-09

# InvoiceDate: POSIXct - stores both a date and a time with an associated
# time zone. The default time zone is the one your computer is set to,
# which is most often your local time zone.

# Details available at https://www.stat.berkeley.edu/~s133/dates.html

# We will later convert it with as.Date() to its simplest (date-only) form

# Check whether Quantity has negative values -- any found must be dropped
summary(dataset)

# Check whether UnitPrice has negative values -- any found must be dropped
summary(dataset)

summary(dataset$UnitPrice)

# Check whether CustomerID has missing values -- rows with NA must be dropped
table(is.na(dataset$CustomerID))

summary(dataset$UnitPrice)

# We will use dplyr package for data cleaning and manipulation

# Before we understand some functions in dplyr package, let us understand pipe operator

# The %>% pipe operator comes from the magrittr package and is re-exported by dplyr

# The pipe enables chaining: you avoid storing results in intermediate variables

# It takes the output of one statement and makes it the input of the next statement.

# Important functions in dplyr package

#mutate() adds new variables that are functions of existing variables (transform) and preserves existing ones

#select() picks variables based on their names.

#filter() picks cases based on their values.

#summarise() reduces multiple values down to a single summary.

#arrange() changes the ordering of the rows.

# group_by() groups rows by one or more variables so that subsequent verbs operate per group

## Drop rows containing missing values (chiefly missing CustomerID)

# dplyr must be attached as well: the %>% pipe plus mutate()/group_by()/
# summarise()/n_distinct() used throughout the rest of this script are
# dplyr verbs, while drop_na() comes from tidyr. Previously only tidyr
# was loaded, so the later pipeline steps would fail to find those verbs.
library(dplyr)
library(tidyr)

# drop_na() with no column arguments removes every row containing at
# least one NA in any column
dataset <- dataset %>%
  drop_na()

str(dataset)

View(dataset)

summary(dataset)

# 168631 obs. of 8 variables

# Recode variables: character identifiers to factors, timestamp to Date.
#
# NOTE: InvoiceDate is already POSIXct (see the str() output above). For
# a POSIXct input, the second positional argument of as.Date() is the
# time zone, NOT a format string, so the previous
# as.Date(InvoiceDate, '%m/%d/%Y %H:%M') passed the format as an invalid
# tz. No format is needed here: as.Date() on POSIXct simply drops the
# time-of-day part.
dataset_recode <- dataset %>%
  mutate(
    InvoiceNo   = as.factor(InvoiceNo),
    StockCode   = as.factor(StockCode),
    InvoiceDate = as.Date(InvoiceDate),
    CustomerID  = as.factor(CustomerID),
    Country     = as.factor(Country)
  )

View(dataset_recode)

summary(dataset_recode$Quantity)

summary(dataset_recode$UnitPrice)

str(dataset_recode)

View(dataset_recode)

## Calculating monetary value

# Spend per transaction line = unit price times quantity purchased;
# stored as a new column (total_dolar) alongside the existing ones.
dataset_recode <- dataset_recode %>%
  mutate(total_dolar = UnitPrice * Quantity)

# Observe the new total_dolar column appended to the data
View(dataset_recode)

# Calculate RFM per customer:
# a. Recency   - days between a fixed reference date and the customer's
#                most recent transaction date (smaller = more recent)
# b. Frequency - number of distinct invoices per customer
# c. Monetary  - sum of all line totals divided by the number of
#                invoices, i.e. average monetary value per transaction

# group_by() converts the table into a grouped table where summarise()
# is computed per CustomerID.

# Reference date hoisted out of the pipeline into a named constant so it
# is easy to see and change; the data runs to 2011-12-09, so 2012-01-01
# is a point shortly after the last transaction.
reference_date <- as.Date("2012-01-01")

processed_data <- dataset_recode %>%
  group_by(CustomerID) %>%
  summarise(
    recency   = as.numeric(reference_date - max(InvoiceDate)),
    frequenci = n_distinct(InvoiceNo),
    monitery  = sum(total_dolar) / n_distinct(InvoiceNo)
  )

# NOTE(review): the original called View(dataset) here, almost certainly
# a copy-paste slip -- the object just built is processed_data.
View(processed_data)

summary(processed_data)

write.csv(processed_data, file = "processed_data.csv", row.names = FALSE)

View(processed_data)

# Recency - How recently did the customer purchase?

summary(processed_data$recency)

boxplot(processed_data$recency, col = "blue")

hist(processed_data$recency)

# Frequency - How often do they purchase?

summary(processed_data$frequenci)

boxplot(processed_data$frequenci, col = "blue")

hist(processed_data$frequenci, breaks = 50)

# Monetary Value - How much do they spend?

hist(processed_data$monitery, breaks = 50)

summary(processed_data$monitery)

boxplot(processed_data$monitery, col = "blue")

# Monetary value is heavily right-skewed (see the histogram above), so
# take logs to reduce the skew before clustering.
# NOTE(review): log() yields -Inf/NaN for zero or negative averages;
# this relies on negative price/quantity rows having been removed
# upstream -- confirm no customer has a non-positive average spend.
processed_data$monitery <- log(processed_data$monitery)

hist(processed_data$monitery)

View(processed_data)

# K means clustering

# Work on a plain data.frame copy: setting row names on a tibble is
# deprecated (the original script triggered that warning), so convert
# explicitly first. CustomerID is moved out of the data columns and into
# the row names because kmeans() needs purely numeric input.
processed_data2 <- as.data.frame(processed_data)

View(processed_data2)

row.names(processed_data2) <- processed_data2$CustomerID

View(processed_data2)

processed_data2$CustomerID <- NULL

## Running the cluster analysis on continuous variables only

# scale() is a generic function whose default method centers and/or
# scales the columns of a numeric matrix; here it standardizes each RFM
# column so no single dimension dominates the Euclidean distances used
# by k-means.
processed_data2 <- scale(processed_data2)

View(processed_data2)

summary(processed_data)

summary(processed_data2)

# The data given by x are clustered by the k-means method, which aims to
# partition the points into k groups such that the sum of squares from
# points to the assigned cluster centres is minimized.
# The algorithm of Hartigan and Wong (1979) is used by default.
#
# Signature (the continuation lines below were previously left
# uncommented, which made the script fail to parse):
## kmeans(x, centers, iter.max = 10, nstart = 1,
##        algorithm = c("Hartigan-Wong", "Lloyd", "Forgy", "MacQueen"),
##        trace = FALSE)
#
# x         - numeric matrix / data frame / vector of data
# centers   - either k, or a set of initial (distinct) cluster centres
# iter.max  - the maximum number of iterations allowed
# algorithm - character, may be abbreviated; note that "Lloyd" and
#             "Forgy" are alternative names for one algorithm
# nstart    - number of random starting assignments tried; the result
#             with the lowest total within-cluster variation is kept.
#             The default is 1, but 25-50 is strongly recommended for a
#             more stable result.
# Hartigan-Wong defines the total within-cluster variation as the sum of
# squared Euclidean distances between items and the corresponding centroid.

## Implementation

# Fix the RNG seed so the nstart = 25 random initialisations (and hence
# the cluster labels) are reproducible across runs
set.seed(123)

Kclust <- kmeans(processed_data2, centers = 6, iter.max = 10, nstart = 25)

View(Kclust$cluster)

table(Kclust$cluster)

View(Kclust$centers)

## Analyzing output
## cluster: A vector of integers (from 1:k) indicating the cluster to which each point is allocated.
## centers: A matrix of cluster centers.
## totss: The total sum of squares.
## withinss: Vector of within-cluster sum of squares, one component per cluster.
## tot.withinss: Total within-cluster sum of squares, i.e. sum(withinss).
## betweenss: The between-cluster sum of squares, i.e. totss - tot.withinss.
## size: The number of points in each cluster.

## Mean of each cluster using the original (unscaled) data.
# Restrict to the numeric RFM columns (matching the hierarchical-cluster
# summary at the end of this script): taking mean() of the CustomerID
# factor in column 1 only produces NA plus a warning.
aggregate(processed_data[, 2:4], by = list(cluster = Kclust$cluster), mean)

## Attach the segment number to the data
Clustering_output <- cbind(processed_data, cluster = Kclust$cluster)

View(Clustering_output)

## Visualizing the clusters

# Install only when missing: an unconditional install.packages() call
# re-downloads on every run and fails in batch mode without a CRAN mirror.
if (!requireNamespace("factoextra", quietly = TRUE)) {
  install.packages("factoextra")
}

library(factoextra)

fviz_cluster(Kclust, data = processed_data2)

## Optimal number of clusters

set.seed(123)

# method = "wss" is the elbow method: plot the total within-cluster sum
# of squares against k and look for the bend.
fviz_nbclust(processed_data2, kmeans, method = "wss")

# method = "silhouette": the average silhouette width measures how well
# each object lies within its cluster; a high average width indicates a
# good clustering.
fviz_nbclust(processed_data2, kmeans, method = "silhouette")

# method = gap statistic (clusGap from the cluster package)

# Install only when missing rather than unconditionally on every run
if (!requireNamespace("cluster", quietly = TRUE)) {
  install.packages("cluster")
}

library(cluster)

# Seed fixed because clusGap bootstraps B = 50 reference data sets
set.seed(123)

gap_stat <- clusGap(processed_data2, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)

library(factoextra)

fviz_gap_stat(gap_stat)

### Hierarchical clustering

library(dendextend)

library(knitr)

View(processed_data2)

# Rebuild the scaled matrix from scratch (processed_data2 was overwritten
# by scale() above). Convert to a plain data.frame before assigning row
# names: setting row names on a tibble is deprecated (the original script
# triggered that warning).
processed_data2 <- as.data.frame(processed_data)

row.names(processed_data2) <- processed_data2$CustomerID

View(processed_data2)

processed_data2$CustomerID <- NULL

View(processed_data2)

## Running the cluster analysis on continuous variables only

# scale() is a generic function whose default method centers and/or
# scales the columns of a numeric matrix
processed_data2 <- scale(processed_data2)

summary(processed_data)

summary(processed_data2)

# Agglomeration methods supported by hclust() (the continuation line
# below was previously left uncommented, which made the script fail to
# parse):
## hclust_methods <- c("ward.D", "single", "complete", "average",
##                     "mcquitty", "median", "centroid", "ward.D2")

## dist() calculates the pairwise distance between customers.
# It takes a data frame / matrix as input and supports the measures:
# "euclidean" (the default, used here), "maximum", "manhattan",
# "canberra", "binary", "minkowski"
d <- dist(processed_data2)

# NOTE(review): printing d dumps the full n x n distance set to the
# console -- very verbose for thousands of customers
d

# Renamed from `c` to `hc` to avoid shadowing base R's c().
# "ward.D2": merge the pair of clusters that gives the minimum increase
# in total within-cluster variance (squared distances).
hc <- hclust(d, method = 'ward.D2')

plot(hc)

# Cut the dendrogram into 8 clusters
members <- cutree(hc, k = 8)

# Look at the first five cluster assignments
members[1:5]

members

table(members)

# Cluster characteristics: mean RFM values per hierarchical cluster
# (columns 2:4 are recency / frequenci / monitery; column 1 is the
# CustomerID factor, which is excluded from mean())
aggregate(processed_data[, 2:4], by = list(members), mean)