- import the data
# Load the built-in Titanic contingency table and inspect its structure.
data(Titanic)
str(Titanic)
# 'table' num [1:4, 1:2, 1:2, 1:2] 0 0 35 0 0 0 17 0 118 154 ...
# - attr(*, "dimnames")=List of 4
# ..$ Class : chr [1:4] "1st" "2nd" "3rd" "Crew"
# ..$ Sex : chr [1:2] "Male" "Female"
# ..$ Age : chr [1:2] "Child" "Adult"
# ..$ Survived: chr [1:2] "No" "Yes"
- convert the array into a data frame
# Expand a table of counts into one row per observed case.
#
# Arguments:
#   x        - data frame with one row per combination plus a count column.
#   countcol - the column of `x` holding the counts (default "Freq").
#
# Returns `x` without the count column, each row repeated as many times as
# its count. The result is always a data frame, even when only a single
# column remains after the count column is removed.
countsToCases <- function(x, countcol = "Freq") {
  # Repeat each row index according to that row's count.
  idx <- rep.int(seq_len(nrow(x)), x[[countcol]])
  # Drop the count column; it is redundant once the rows are expanded.
  x[[countcol]] <- NULL
  # drop = FALSE stops a one-column result from collapsing to a bare vector.
  x[idx, , drop = FALSE]
}
# Turn the Titanic table into a count data frame, then expand it so that
# every counted person becomes one row.
caseTita <- countsToCases(as.data.frame(Titanic))
# Peek at the first rows of the case-level data.
head(caseTita)
# Class Sex Age Survived
# 3 3rd Male Child No
# 3.1 3rd Male Child No
# 3.2 3rd Male Child No
# 3.3 3rd Male Child No
# 3.4 3rd Male Child No
# 3.5 3rd Male Child No
# Total number of expanded cases.
nrow(caseTita)
# [1] 2201
- Naïve Bayes classification
library(e1071)
# Fit a naive Bayes classifier of Survived on all remaining columns.
model <- naiveBayes(Survived ~ ., data = caseTita)
# Predicted classes for 10 randomly drawn rows; the row count is taken from
# the data instead of being hard-coded.
predict(model, caseTita[sample(nrow(caseTita), 10, replace = FALSE), ])
# [1] No No No No No No Yes No Yes No
# Levels: No Yes
# Per-class probabilities (type = "raw") for a second, independently drawn
# sample of 10 rows.
predict(
  model,
  caseTita[sample(nrow(caseTita), 10, replace = FALSE), ],
  type = "raw"
)
# No Yes
# [1,] 0.7247820 0.2752180
# [2,] 0.6960593 0.3039407
# [3,] 0.8466171 0.1533829
# [4,] 0.3679509 0.6320491
# [5,] 0.8466171 0.1533829
# [6,] 0.7247820 0.2752180
# [7,] 0.8466171 0.1533829
# [8,] 0.3523184 0.6476816
# [9,] 0.8552217 0.1447783
# [10,] 0.8466171 0.1533829
# Fit the same formula directly on the Titanic contingency table rather than
# on the expanded case-level data frame.
m <- naiveBayes(Survived ~ ., data = Titanic)
# Show the fitted model: a-priori and conditional probability tables.
m
# Naive Bayes Classifier for Discrete Predictors
#
# Call:
# naiveBayes.formula(formula = Survived ~ ., data = Titanic)
#
# A-priori probabilities:
# Survived
# No Yes
# 0.676965 0.323035
#
# Conditional probabilities:
# Class
# Survived 1st 2nd 3rd Crew
# No 0.08187919 0.11208054 0.35436242 0.45167785
# Yes 0.28551336 0.16596343 0.25035162 0.29817159
#
# Sex
# Survived Male Female
# No 0.91543624 0.08456376
# Yes 0.51617440 0.48382560
#
# Age
# Survived Child Adult
# No 0.03489933 0.96510067
# Yes 0.08016878 0.91983122
- split the data into the predictor data frame and outcome vector, then train a naïve Bayes model with caret using 10-fold cross-validation
library(caret)
# Predictors: every column except the outcome, selected by name so the split
# does not depend on Survived being the 4th column.
x <- caseTita[, setdiff(names(caseTita), "Survived"), drop = FALSE]
# Outcome vector.
y <- caseTita$Survived
# Train naive Bayes through caret, evaluated with 10-fold cross-validation.
model1 <- train(
  x, y,
  method = "nb",
  trControl = trainControl(method = "cv", number = 10)
)
model1
# Naive Bayes
#
# 2201 samples
# 3 predictor
# 2 classes: 'No', 'Yes'
#
# No pre-processing
# Resampling: Cross-Validated (10 fold)
# Summary of sample sizes: 1981, 1981, 1981, 1981, 1981, 1981, ...
# Resampling results across tuning parameters:
#
# usekernel Accuracy Kappa
# FALSE 0.7791814 0.4474594
# TRUE 0.7791814 0.4474594
#
# Tuning parameter 'fL' was held constant at a value of 0
# Tuning parameter 'adjust' was held
# constant at a value of 1
# Accuracy was used to select the optimal model using the largest value.
# The final values used for the model were fL = 0, usekernel = FALSE and adjust = 1.
- predict the outcome
# Predicted classes for 10 randomly drawn rows; the row count is taken from
# the data instead of being hard-coded.
predict(
  model1$finalModel,
  caseTita[sample(nrow(caseTita), 10, replace = FALSE), ]
)$class
# 27.63 12.225 30.44 12.630 11.24 15.38 9.76 31.15 10.150 10.53
# No No Yes No No Yes No Yes No No
# Levels: No Yes
# Cross-tabulate predictions for all rows of x (table rows) against the
# observed outcome y (table columns).
table(predict(model1$finalModel, x)$class, y)
# y
# No Yes
# No 1364 362
# Yes 126 349
Reference:
Zhang, Zhongheng. "Naïve Bayes classification in R." Annals of Translational Medicine, 2016.