Predicting survivors of the Titanic using machine learning

Case description

Problem – Predicting survivors of the Titanic using machine learning.

This case study is part of the Kaggle competition. You can make the submission of your prediction on Kaggle.

This data is also available at https://www.kaggle.com/c/titanic/data

Data / Variables description

Variable | Description | Category
survival | Survival | 0 = No, 1 = Yes
pclass | Ticket class | 1 = 1st, 2 = 2nd, 3 = 3rd
sex | Sex |
Age | Age in years |
sibsp | # of siblings / spouses aboard the Titanic |
parch | # of parents / children aboard the Titanic |
ticket | Ticket number |
fare | Passenger fare |
cabin | Cabin number |
embarked | Port of Embarkation | C = Cherbourg, Q = Queenstown, S = Southampton

Download dataset to practice

Train

Test

Solution with R codes

1. Download and save the above dataset (Train and Test) on your desktop.

2. Create a project in RStudio

If you are new to RStudio, learn how to create a project in RStudio:

http://datasciencevidhya.com/post/how-to-create-a-project-in-r-studio

3. Link the saved dataset and set up the working directory

R codes

Data Exploration

# Load the raw training and test sets from CSV.
# read.csv() treats the first row as the header by default.
train <- read.csv(file = "train.csv")
test <- read.csv(file = "test.csv")

# Dump both data frames to the console
print(train)
print(test)

# Open each data frame in the spreadsheet-style viewer
View(train)
View(test)

# Show the first six rows of each data set
head(train, n = 6L)
head(test, n = 6L)

# Inspect the column types and a preview of the values
str(train)
str(test)


Missing value analysis



# Missing-value analysis for the train and test sets.
#
# read.csv() turns blank fields into NA only for numeric/logical columns;
# a blank field in a text column is read as the empty string "", which
# is.na() alone reports as present. is_missing() flags both cases.
# (Presumably relevant for text columns such as Cabin and Embarked here;
# confirm against the tables printed below.)
#
# x: a vector (one data-frame column).
# Returns a logical vector of the same length: TRUE where the entry is NA
# or an empty string, FALSE otherwise. %in% never yields NA, so the result
# contains no NA itself.
is_missing <- function(x) {
  is.na(x) | as.character(x) %in% ""
}

is_missing(train$Pclass) # TRUE where a value is missing, otherwise FALSE

# Storing these flags in a data frame
table(is_missing(train$Pclass))
missing_Pclass <- data.frame(is_missing(train$Pclass))
View(missing_Pclass)
table(missing_Pclass) # the original author's run found no missing values

# Do this for the other training variables as well
missing_train_Sex <- data.frame(is_missing(train$Sex))
missing_train_Age <- data.frame(is_missing(train$Age))
missing_train_Sibsp <- data.frame(is_missing(train$SibSp))
missing_train_Parch <- data.frame(is_missing(train$Parch))
missing_train_Ticket <- data.frame(is_missing(train$Ticket))
missing_train_Fare <- data.frame(is_missing(train$Fare))
missing_train_Cabin <- data.frame(is_missing(train$Cabin))
missing_train_Embarked <- data.frame(is_missing(train$Embarked))

# Tabulate missing values (train)
table(missing_train_Sex)
table(missing_train_Age)
table(missing_train_Sibsp)
table(missing_train_Parch)
table(missing_train_Ticket)
table(missing_train_Fare)
table(missing_train_Cabin)
table(missing_train_Embarked)

# Look at the raw Embarked values and their frequencies
table(train$Embarked)

train$Embarked

# Same flags for the test data
missing_test_Sex <- data.frame(is_missing(test$Sex))
missing_test_Age <- data.frame(is_missing(test$Age))
missing_test_Sibsp <- data.frame(is_missing(test$SibSp))
missing_test_Parch <- data.frame(is_missing(test$Parch))
missing_test_Ticket <- data.frame(is_missing(test$Ticket))
missing_test_Fare <- data.frame(is_missing(test$Fare))
missing_test_Cabin <- data.frame(is_missing(test$Cabin))
missing_test_Embarked <- data.frame(is_missing(test$Embarked))

# Tabulate missing values (test)
table(missing_test_Sex)
table(missing_test_Age)
table(missing_test_Sibsp)
table(missing_test_Parch)
table(missing_test_Ticket)
table(missing_test_Fare)
table(missing_test_Cabin)
table(missing_test_Embarked)

# Recap for the training set without the intermediate data frames
table(is_missing(train$Pclass))
table(is_missing(train$Sex))
table(is_missing(train$Age))
table(is_missing(train$SibSp))
table(is_missing(train$Parch))
table(is_missing(train$Ticket))
table(is_missing(train$Fare))
table(is_missing(train$Cabin))
table(is_missing(train$Embarked))

Exploratory data analysis
 

Frequency tables and descriptive statistics

 

# Convert the categorical columns of the training data to factors and
# print a frequency table for each one, in the listed order.
for (factor_col in c("Pclass", "Sex", "SibSp", "Parch", "Embarked", "Survived")) {
  train[[factor_col]] <- as.factor(train[[factor_col]])
  print(table(train[[factor_col]]))
}

# Same conversion for the test data (it has no Survived column at this point)
for (factor_col in c("Pclass", "Sex", "SibSp", "Parch", "Embarked")) {
  test[[factor_col]] <- as.factor(test[[factor_col]])
  print(table(test[[factor_col]]))
}

           
# Descriptive statistics for the continuous variable Age (train data);
# NA entries are excluded from the standard deviation and the mean.
summary(train$Age)
sd(train$Age, na.rm = TRUE)
mean(train$Age, na.rm = TRUE)

# Descriptive statistics for the continuous variable Age (test data)
summary(test$Age)
sd(test$Age, na.rm = TRUE)
mean(test$Age, na.rm = TRUE)

# Mean imputation: every missing Age in train becomes the mean of the
# observed train ages.
train$Age <- replace(train$Age, is.na(train$Age), mean(train$Age, na.rm = TRUE))

# Mean imputation for test, using the mean of the observed test ages.
test$Age <- replace(test$Age, is.na(test$Age), mean(test$Age, na.rm = TRUE))

## Add a Survived column to test so that it can hold predictions.
# The placeholder is all NA and reuses the factor levels of train$Survived,
# so both data sets share the same labels in the same order.
#
# Do not assign a differently ordered vector through `levels(x) <- ...`:
# that renames the existing levels position by position instead of
# reordering them. Giving c("1", "0") to a factor whose levels are
# "0", "1" relabels every 0 as 1 and every 1 as 0, which silently inverts
# the recorded outcome in train.
test$Survived <- factor(rep(NA, nrow(test)), levels = levels(train$Survived))
levels(test$Survived)

# Survival counts in absolute numbers
table(train$Survived)

# As proportions
prop.table(table(train$Survived))

 

Bivariate relationships



# Two-way comparison: Sex (rows) and Survived (columns), as counts
table(Sex = train$Sex, Survived = train$Survived)

# Two-way comparison: row-wise proportions.
# margin = 1 makes each row (each sex) sum to 1, so the cells are the share
# of each outcome within a sex. Without margin, prop.table() divides by the
# grand total and returns overall cell proportions instead.
prop.table(table(Sex = train$Sex, Survived = train$Survived), margin = 1)

Make prediction using Zero classification model

Either everyone survives or everyone dies

# Prediction 1 - everyone dies:
# copy the test set and mark every passenger as not surviving (0).
everyone_dies <- test
everyone_dies[["Survived"]] <- 0

View(everyone_dies)

# Build the Kaggle submission (PassengerId plus our Survived prediction)
# and write it to a CSV file without row names.
submit <- with(
  everyone_dies,
  data.frame(PassengerId = PassengerId, Survived = Survived)
)
write.csv(submit, file = "everyone_dies.csv", row.names = FALSE)

# Prediction 2 - everyone survives:
# copy the test set and mark every passenger as surviving (1).
everyonesurvives <- test
everyonesurvives[["Survived"]] <- 1
View(everyonesurvives)

# Build the Kaggle submission (PassengerId plus our Survived prediction)
# and write it to a CSV file without row names.
submit <- with(
  everyonesurvives,
  data.frame(PassengerId = PassengerId, Survived = Survived)
)
write.csv(submit, file = "everyone_survives.csv", row.names = FALSE)

# Prediction 3 - All females survived / Was the "women and children first"
# rule followed?
# Cross-tabulate survival against sex to see how the two are related.

# Install gmodels only when it is not available yet; an unconditional
# install.packages() call re-installs the package on every run of the script.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels")
}
library(gmodels)
CrossTable(train$Survived, train$Sex)


Allfemalesurvived <- test

# Initialize the Survived column to 0 for everyone
Allfemalesurvived$Survived <- 0

# Set Survived to 1 where Sex equals "female"
Allfemalesurvived$Survived[Allfemalesurvived$Sex == "female"] <- 1

# Export the prediction as a .csv file (PassengerId plus Survived)

submit <- data.frame(
  PassengerId = Allfemalesurvived$PassengerId,
  Survived = Allfemalesurvived$Survived
)

write.csv(submit, file = "femalessurvived.csv", row.names = FALSE)

Supervised Machine learning Algorithms for predictive modeling.

Logistic regression


Train the model using the training sets and check score

# Fit a logistic regression of Survived on ticket class, sex, port of
# embarkation and age, using the training data.
# With a factor response, a binomial glm treats the first factor level as
# "failure" and models the probability of the remaining level, so check
# levels(train$Survived) to see which label the fitted probabilities refer to.
logistic <- glm(
  formula = Survived ~ Pclass + Sex + Embarked + Age,
  data = train,
  family = "binomial"
)

summary(logistic)

# Predict on the test set; type = "response" returns probabilities
# rather than values on the log-odds scale.
my_log_prediction <- predict(logistic, newdata = test, type = "response")

# Submission 1: the predicted probabilities
submit <- data.frame(PassengerId = test$PassengerId, Survived = my_log_prediction)

write.csv(submit, file = "logistic_prob.csv", row.names = FALSE)

# Submission 2: class labels, obtained by thresholding the probabilities
# at 0.5. The threshold is applied to my_log_prediction; the script never
# defines an object called `predicted`, so referring to it fails with
# "object 'predicted' not found".
predicted.classes <- ifelse(my_log_prediction > 0.5, "1", "0")
table(predicted.classes)

submit <- data.frame(PassengerId = test$PassengerId, Survived = predicted.classes)
write.csv(submit, file = "logistic_class.csv", row.names = FALSE)

# Submission 3: probabilities and class labels side by side
submit <- data.frame(
  PassengerId = test$PassengerId,
  Survived = my_log_prediction,
  Classification = predicted.classes
)
write.csv(submit, file = "logistic_class_prob.csv", row.names = FALSE)

Decision Trees

 

CART decision tree algorithm
library(rpart)

# Install the plotting helpers only when they are missing; unconditional
# install.packages() calls re-install all three packages on every run.
for (pkg in c("rattle", "rpart.plot", "RColorBrewer")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(rattle)
library(rpart.plot)

# fit is our decision tree model: a classification tree (method = "class")
# for Survived, grown on the training data.
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Embarked,
  data = train,
  method = "class"
)

# Let us visualize the tree: first with base graphics ...
plot(fit)
text(fit)

# ... then with the formatted plot from rattle
fancyRpartPlot(fit)

# Let us make the prediction using the decision tree model;
# type = "class" returns the predicted class label for each test passenger.
Prediction <- predict(fit, test, type = "class")
submit <- data.frame(PassengerId = test$PassengerId, Survived = Prediction)
write.csv(submit, file = "myfirstdtree.csv", row.names = FALSE)

Leave a Reply

Your email address will not be published.