Read the titanicTrain data and store it in train
library(magrittr)
library(ggplot2)
train <- read.csv("titanicTrain.csv")
train <- train[c(1:1000),]
# Train data 上面的NA
str(train)
## 'data.frame': 1000 obs. of 14 variables:
## $ pclass : int 1 1 1 1 1 1 1 1 1 1 ...
## $ survived : int 1 1 0 0 0 1 1 0 1 0 ...
## $ name : Factor w/ 999 levels "","Abbing, Mr. Anthony",..: 23 25 26 27 28 32 47 48 52 56 ...
## $ sex : Factor w/ 3 levels "","female","male": 2 3 2 3 2 3 2 3 2 3 ...
## $ age : num 29 0.917 2 30 25 ...
## $ sibsp : int 0 1 1 1 1 0 1 0 2 0 ...
## $ parch : int 0 2 2 2 2 0 0 0 0 0 ...
## $ ticket : Factor w/ 697 levels "","110152","110413",..: 188 51 51 51 51 125 94 17 78 617 ...
## $ fare : num 211 152 152 152 152 ...
## $ cabin : Factor w/ 181 levels "","A10","A11",..: 45 81 81 81 81 150 147 17 63 1 ...
## $ embarked : Factor w/ 4 levels "","C","Q","S": 4 4 4 4 4 4 4 4 4 2 ...
## $ boat : Factor w/ 27 levels "","1","10","11",..: 12 4 1 1 1 13 3 1 27 1 ...
## $ body : int NA NA NA 135 NA NA NA NA NA 22 ...
## $ home.dest: Factor w/ 367 levels "","?Havana, Cuba",..: 307 230 230 230 230 236 161 24 22 228 ...
sapply(train, function(x) {sum(is.na(x))})
## pclass survived name sex age sibsp parch
## 0 0 0 0 139 0 0
## ticket fare cabin embarked boat body home.dest
## 0 0 0 0 0 905 0
Pclass Variable
# Pclass
# 不同船艙等級是否與死亡有關
train$survived <- train$survived %>% as.factor()
ggplot(train[!is.na(train$survived),], aes(x = pclass, fill = survived )) +
geom_bar(stat='count', position='dodge') + labs(x = 'Train data') +
theme(legend.position="none") + theme_grey()

# 結果: 低等船艙(P3)死亡率較高
Sex Variable
## 性別是否與死亡有關
ggplot(train[!is.na(train$survived),], aes(x = sex, fill = survived)) +
geom_bar(stat='count', position='dodge') + theme_grey() +
labs(x = 'Train data') +
geom_label(stat='count', aes(label=..count..))

# 結果: 男性死亡數比較多
Pclass and Sex Variable
ggplot(train[!is.na(train$survived),], aes(x = pclass, fill = survived)) +
geom_bar(stat='count', position='stack') +
labs(x = 'Train data ', y= "Count") + facet_grid(.~sex) +
theme(legend.position="none") + theme_grey()

Embarked Variable
train$embarked <- train$embarked %>% as.factor
summary(train$embarked[train$survived==0])
## C Q S
## 0 95 50 432
summary(train$embarked[train$survived==1])
## C Q S
## 2 129 21 271
embark <- cbind(summary(train$embarked[train$survived==0]),summary(train$embarked[train$survived==1]))
embark <- embark[-1,] %>% t
rownames(embark) <- c("0","1")
barplot(embark,col=c("gray","black"),main="embarked variable",beside=TRUE,ylab="counts")
legend("topright", inset=.02,title="Survive",
c("0","1"), fill=c("gray","black"), horiz=TRUE, cex=0.8)

# 結果: 看起來從Q及S離港的人存活率較低一點
family = sibsp + parch + 1
# 一個人旅行及全家(結伴)搭船死亡數
family <- train$parch + train$sibsp + 1
train <- cbind(train,family)
ps0 <- train$family[train$survived==0] %>% as.factor %>% summary
ps1 <- train$family[train$survived==1] %>% as.factor %>% summary %>% c(.,0)
family <- rbind(ps0,ps1)
rm(ps0)
rm(ps1)
barplot(family,col=c("gray","black"),main="family",beside=TRUE,ylab="counts",xlab="Number of people")
legend("topright", inset=.02,title="Survive",
c("0","1"), fill=c("gray","black"), horiz=TRUE, cex=0.8)

# 結果: 單身漢死比較多
#load data
train <- read.csv("titanicTrain.csv", stringsAsFactors = F, na.strings = c("NA", ""))
test <- read.csv("titanicQuestion.csv", stringsAsFactors = F, na.strings = c("NA", ""))
train <- train[c(1:1000),]
#bind the data
all <- rbind(train, test)
#as factor
all$sex <- as.factor(all$sex)
all$survived <- as.factor(all$survived)
Boat Variable
#plot b1, find out that the survived condition of "every boat"
#first we rename all boats with number 1 to 28
#tranfer original boat numbers to 1:27, 28 refer to NA
all$boat[all$boat == "5 7"] <- "17"
all$boat[all$boat == "5 9"] <- "18"
all$boat[all$boat == "8 10"] <- "19"
all$boat[all$boat == "13 15"] <- "20"
all$boat[all$boat == "13 15 B"] <- "21"
all$boat[all$boat == "15 16"] <- "22"
all$boat[all$boat == "A"] <- "23"
all$boat[all$boat == "B"] <- "24"
all$boat[all$boat == "C"] <- "25"
all$boat[all$boat == "D"] <- "26"
all$boat[all$boat == "C D"] <- "27"
all$boat[is.na(all$boat)] <- "28"
#escape = 1 refer to the refugee who successfully took on boat
all$escape[all$boat != "28"] <- "1"
all$escape[all$boat == "28"] <- "0"
#see if taking boat is highly related survived
b1 <- ggplot(all[!is.na(all$survived),], aes(x = escape, fill = survived)) +
geom_bar(stat='count', position='dodge') + theme_grey() +
labs(x = 'escape from ship') +
geom_label(stat='count', aes(label=..count..))
b1

# 結果: 搭上船的大多都會活
#plot b2, see if there's anyone who survived without boat
#subset of those who took on boat
survived_boat <- subset(all, all$boat != "28")
#see if there anyone who took on boat and didn't survive
b2 <- ggplot(survived_boat[!is.na(survived_boat$survived),], aes(x = boat, fill = survived)) +
geom_bar(stat='count', position='dodge') +
labs(x = 'boat_survive') + theme_grey()
b2

Body variable
# 有屍體紀錄
#replace those data to "0"(body info is NA) and "1"(body info isn't NA)
all$body[is.na(all$body)] <- "0"
all$body[all$body != "0"] <- "1"
all$body <- as.factor(all$body)
#=== guessing there is something to do with dead people
#set a subset of those which get the body's value
have_body <- subset(all, all$body != "0")
#find out if those data with body info are highly related to death
bo <- ggplot(have_body[!is.na(have_body$survived),], aes(x = body, fill = survived)) +
geom_bar(stat='count', position='dodge') +
labs(x = 'body_info') + theme_grey()
bo

## 有body的人全死!
#=== there isn't any survivor has body data
#=== regarding from bo, it seems those who have body info all died
#so we want to find out the distribution of survived and body
bo_2 <-ggplot(all[!is.na(all$survived),], aes(x = body, fill = survived)) +
geom_bar(stat='count', position='dodge') + theme_grey() +
labs(x = 'body with death') +
geom_label(stat='count', aes(label=..count..))
bo_2

#=== the outcomes of these 2 plots reveal that those who got body value was definitely dead