# Count rows per Category (rows with a missing Category are dropped first),
# then express each count as a rounded share.
# NOTE(review): the 4.54 divisor is hard-coded -- presumably (total rows) / 100,
# i.e. 454 rows; confirm against the data before reusing this chunk.
df %>%
  filter(!is.na(Category)) %>%
  group_by(Category) %>%
  summarise(n = n()) %>%
  mutate(pc = round(n / 4.54, 1))
# Bar chart of row counts per Category, one fill colour per category.
# Rows with a missing Category are excluded before plotting.
df %>%
  filter(!is.na(Category)) %>%
  ggplot(aes(x = Category, fill = Category)) +
  geom_bar()
# Pairwise scatterplot matrix of Age, Total_wait, cRF and Days_from_offer
# taken from df.tx.
scatterplotMatrix(
  ~ Age + Total_wait + cRF + Days_from_offer,
  data = df.tx
)
The data is split into training and validation sets in a 70/30 ratio.
Naive Bayes Classifier for Discrete Predictors
Call:
naiveBayes.default(x = X, y = Y, laplace = laplace)
A-priori probabilities:
Y
0 1
0.6705202 0.3294798
Conditional probabilities:
Age.cat
Y [18,30) [30,40) [40,50) [50,60) [60,85)
0 0.05172414 0.12931034 0.09482759 0.43965517 0.28448276
1 0.07017544 0.19298246 0.21052632 0.29824561 0.22807018
Blood_group
Y A AB B null O
0 0.34482759 0.05172414 0.09482759 0.00862069 0.50000000
1 0.52631579 0.08771930 0.03508772 0.00000000 0.35087719
cRF.cat
Y [0,10) [10,40) [40,70) [70,85) [85,95) [95,100)
0 0.68965517 0.10344828 0.05172414 0.04310345 0.01724138 0.09482759
1 0.82456140 0.05263158 0.01754386 0.07017544 0.00000000 0.03508772
TotWt
Y [0,365) [365,730) [730,1.1e+03) [1.1e+03,1.46e+03) [1.46e+03,1.82e+03)
0 0.37962963 0.24074074 0.26851852 0.04629630 0.06481481
1 0.33333333 0.31481481 0.09259259 0.18518519 0.07407407
ReactWt
Y [0,365) [365,730) [730,1.1e+03) [1.1e+03,1.46e+03) [1.46e+03,1.82e+03)
0 0.53571429 0.18750000 0.20535714 0.04464286 0.02678571
1 0.56140351 0.29824561 0.07017544 0.03508772 0.03508772
# Predict classes for the training rows with nbm5 (fitted outside this chunk)
# and cross-tabulate predictions (rows) against observed Tx_100d (columns).
NB_Predictions5a <- predict(nbm5, newdata = train)
conf.matrix5a <- table(NB_Predictions5a, train$Tx_100d)
conf.matrix5a
NB_Predictions5a 0 1
0 96 24
1 20 33
# Accuracy statistics for the training-set predictions.
# NOTE(review): no `positive` argument is given, and the printed report treats
# class 0 as the positive class -- confirm that is the intended reading.
confusionMatrix(
  data = NB_Predictions5a,
  reference = train$Tx_100d
)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 96 24
1 20 33
Accuracy : 0.7457
95% CI : (0.674, 0.8087)
No Information Rate : 0.6705
P-Value [Acc > NIR] : 0.01992
Kappa : 0.4139
Mcnemar's Test P-Value : 0.65108
Sensitivity : 0.8276
Specificity : 0.5789
Pos Pred Value : 0.8000
Neg Pred Value : 0.6226
Prevalence : 0.6705
Detection Rate : 0.5549
Detection Prevalence : 0.6936
Balanced Accuracy : 0.7033
'Positive' Class : 0
# Predict classes for the held-out rows with nbm5 and cross-tabulate
# predictions (rows) against observed Tx_100d (columns).
NB_Predictions5b <- predict(nbm5, newdata = test)
conf.matrix5b <- table(NB_Predictions5b, test$Tx_100d)
conf.matrix5b
NB_Predictions5b 0 1
0 36 12
1 13 12
# Accuracy statistics for the held-out predictions.
# NOTE(review): no `positive` argument is given, and the printed report treats
# class 0 as the positive class -- confirm that is the intended reading.
confusionMatrix(
  data = NB_Predictions5b,
  reference = test$Tx_100d
)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 36 12
1 13 12
Accuracy : 0.6575
95% CI : (0.5372, 0.7647)
No Information Rate : 0.6712
P-Value [Acc > NIR] : 0.6501
Kappa : 0.2322
Mcnemar's Test P-Value : 1.0000
Sensitivity : 0.7347
Specificity : 0.5000
Pos Pred Value : 0.7500
Neg Pred Value : 0.4800
Prevalence : 0.6712
Detection Rate : 0.4932
Detection Prevalence : 0.6575
Balanced Accuracy : 0.6173
'Positive' Class : 0
The data is again split into training and validation sets, in the same 70/30 ratio.
# Build a 70/30 partition of df.b with createDataPartition() on the 6-month
# outcome (Tx_6m); list = FALSE returns row indices usable directly in `[`.
# NOTE(review): the partition is random; the printed results below this chunk
# will change on re-run unless a seed is set beforehand.
trainIndex6 <- createDataPartition(
  df.b$Tx_6m,
  p = 0.7, list = FALSE, times = 1
)

# Fix: index with trainIndex6. The original used `trainIndex`, which is not
# defined in this chunk, so the Tx_6m partition computed above was never
# applied to the 6-month model.
train6 <- df.b[trainIndex6, ]
test6 <- df.b[-trainIndex6, ]

# Naive Bayes model for Tx_6m on the five categorical predictors, fitted on
# the training rows only; printing shows the priors and conditional tables.
nbm.6m <- naiveBayes(
  Tx_6m ~ Age.cat + Blood_group + cRF.cat + TotWt + ReactWt,
  data = train6
)
nbm.6m
Naive Bayes Classifier for Discrete Predictors
Call:
naiveBayes.default(x = X, y = Y, laplace = laplace)
A-priori probabilities:
Y
0 1
0.5549133 0.4450867
Conditional probabilities:
Age.cat
Y [18,30) [30,40) [40,50) [50,60) [60,85)
0 0.05208333 0.11458333 0.10416667 0.45833333 0.27083333
1 0.06493506 0.19480519 0.16883117 0.31168831 0.25974026
Blood_group
Y A AB B null O
0 0.32291667 0.05208333 0.09375000 0.01041667 0.52083333
1 0.50649351 0.07792208 0.05194805 0.00000000 0.36363636
cRF.cat
Y [0,10) [10,40) [40,70) [70,85) [85,95) [95,100)
0 0.67708333 0.08333333 0.06250000 0.05208333 0.02083333 0.10416667
1 0.80519481 0.09090909 0.01298701 0.05194805 0.00000000 0.03896104
TotWt
Y [0,365) [365,730) [730,1.1e+03) [1.1e+03,1.46e+03) [1.46e+03,1.82e+03)
0 0.41573034 0.24719101 0.25842697 0.03370787 0.04494382
1 0.30136986 0.28767123 0.15068493 0.16438356 0.09589041
ReactWt
Y [0,365) [365,730) [730,1.1e+03) [1.1e+03,1.46e+03) [1.46e+03,1.82e+03)
0 0.54347826 0.18478261 0.18478261 0.05434783 0.03260870
1 0.54545455 0.27272727 0.12987013 0.02597403 0.02597403
# Predict classes for the training rows with nbm.6m and cross-tabulate
# predictions (rows) against observed Tx_6m (columns).
NB_Predictions6m <- predict(nbm.6m, newdata = train6)
conf.matrix6m <- table(NB_Predictions6m, train6$Tx_6m)
conf.matrix6m
NB_Predictions6m 0 1
0 68 25
1 28 52
# Chi-squared test on the 2x2 table of predicted vs observed classes
# for the training rows.
chisq.test(x = conf.matrix6m)
Pearson's Chi-squared test with Yates' continuity correction
data: conf.matrix6m
X-squared = 23.78, df = 1, p-value = 1.08e-06
# Accuracy statistics for the 6-month model on the training rows.
# NOTE(review): no `positive` argument is given, and the printed report treats
# class 0 as the positive class -- confirm that is the intended reading.
confusionMatrix(
  data = NB_Predictions6m,
  reference = train6$Tx_6m
)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 68 25
1 28 52
Accuracy : 0.6936
95% CI : (0.6192, 0.7614)
No Information Rate : 0.5549
P-Value [Acc > NIR] : 0.00013
Kappa : 0.3822
Mcnemar's Test P-Value : 0.78353
Sensitivity : 0.7083
Specificity : 0.6753
Pos Pred Value : 0.7312
Neg Pred Value : 0.6500
Prevalence : 0.5549
Detection Rate : 0.3931
Detection Prevalence : 0.5376
Balanced Accuracy : 0.6918
'Positive' Class : 0
# Predict classes for the held-out rows with nbm.6m and cross-tabulate
# predictions (rows) against observed Tx_6m (columns).
NB_Predictions6t <- predict(nbm.6m, newdata = test6)
conf.matrix6t <- table(NB_Predictions6t, test6$Tx_6m)
conf.matrix6t
NB_Predictions6t 0 1
0 20 12
1 16 25
# Chi-squared test on the 2x2 table of predicted vs observed classes
# for the held-out rows.
chisq.test(x = conf.matrix6t)
Pearson's Chi-squared test with Yates' continuity correction
data: conf.matrix6t
X-squared = 3.0791, df = 1, p-value = 0.0793
# Accuracy statistics for the 6-month model on the held-out rows.
# NOTE(review): no `positive` argument is given, and the printed report treats
# class 0 as the positive class -- confirm that is the intended reading.
confusionMatrix(
  data = NB_Predictions6t,
  reference = test6$Tx_6m
)
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 20 12
1 16 25
Accuracy : 0.6164
95% CI : (0.4952, 0.7279)
No Information Rate : 0.5068
P-Value [Acc > NIR] : 0.03911
Kappa : 0.2316
Mcnemar's Test P-Value : 0.57075
Sensitivity : 0.5556
Specificity : 0.6757
Pos Pred Value : 0.6250
Neg Pred Value : 0.6098
Prevalence : 0.4932
Detection Rate : 0.2740
Detection Prevalence : 0.4384
Balanced Accuracy : 0.6156
'Positive' Class : 0