Example and Summary of Classifiers with Spam Email Data in R

The increasing volume of unsolicited bulk e-mail (spam) has created a need for reliable anti-spam filters. Machine learning techniques are frequently employed to filter spam automatically, with reasonable success.

The dataset comes from the UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/).

We build several binary classifiers that label e-mails as either normal (non-spam) or spam, and compare their test-set accuracy as a summary.


library(ElemStatLearn) # data sets from "The Elements of Statistical Learning"
library(tree)          # classification trees
library(maptree)       # plotting and pruning tree models

## Loading required package: cluster

## Loading required package: rpart

library(rpart)  # Recursive Partitioning and Regression Trees (rpart)
library(e1071)  # svm() and naiveBayes()
library(FNN)    # fast k-nearest neighbours (knn)
library(C50)    # C5.0 decision trees

spambased<-read.csv("spambase.csv",header=T,sep=",")
str(spambased)

## 'data.frame':    4601 obs. of  58 variables:
##  $ w01 : num  0 0.21 0.06 0 0 0 0 0 0.15 0.06 ...
##  $ w02 : num  0.64 0.28 0 0 0 0 0 0 0 0.12 ...
##  $ w03 : num  0.64 0.5 0.71 0 0 0 0 0 0.46 0.77 ...
##  $ w04 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w05 : num  0.32 0.14 1.23 0.63 0.63 1.85 1.92 1.88 0.61 0.19 ...
##  $ w06 : num  0 0.28 0.19 0 0 0 0 0 0 0.32 ...
##  $ w07 : num  0 0.21 0.19 0.31 0.31 0 0 0 0.3 0.38 ...
##  $ w08 : num  0 0.07 0.12 0.63 0.63 1.85 0 1.88 0 0 ...
##  $ w09 : num  0 0 0.64 0.31 0.31 0 0 0 0.92 0.06 ...
##  $ w10 : num  0 0.94 0.25 0.63 0.63 0 0.64 0 0.76 0 ...
##  $ w11 : num  0 0.21 0.38 0.31 0.31 0 0.96 0 0.76 0 ...
##  $ w12 : num  0.64 0.79 0.45 0.31 0.31 0 1.28 0 0.92 0.64 ...
##  $ w13 : num  0 0.65 0.12 0.31 0.31 0 0 0 0 0.25 ...
##  $ w14 : num  0 0.21 0 0 0 0 0 0 0 0 ...
##  $ w15 : num  0 0.14 1.75 0 0 0 0 0 0 0.12 ...
##  $ w16 : num  0.32 0.14 0.06 0.31 0.31 0 0.96 0 0 0 ...
##  $ w17 : num  0 0.07 0.06 0 0 0 0 0 0 0 ...
##  $ w18 : num  1.29 0.28 1.03 0 0 0 0.32 0 0.15 0.12 ...
##  $ w19 : num  1.93 3.47 1.36 3.18 3.18 0 3.85 0 1.23 1.67 ...
##  $ w20 : num  0 0 0.32 0 0 0 0 0 3.53 0.06 ...
##  $ w21 : num  0.96 1.59 0.51 0.31 0.31 0 0.64 0 2 0.71 ...
##  $ w22 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w23 : num  0 0.43 1.16 0 0 0 0 0 0 0.19 ...
##  $ w24 : num  0 0.43 0.06 0 0 0 0 0 0.15 0 ...
##  $ w25 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w26 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w27 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w28 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w29 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w30 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w31 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w32 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w33 : num  0 0 0 0 0 0 0 0 0.15 0 ...
##  $ w34 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w35 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w36 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w37 : num  0 0.07 0 0 0 0 0 0 0 0 ...
##  $ w38 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w39 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w40 : num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ w41 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w42 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w43 : num  0 0 0.12 0 0 0 0 0 0.3 0 ...
##  $ w44 : num  0 0 0 0 0 0 0 0 0 0.06 ...
##  $ w45 : num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ w46 : num  0 0 0.06 0 0 0 0 0 0 0 ...
##  $ w47 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ w48 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ c1  : num  0 0 0.01 0 0 0 0 0 0 0.04 ...
##  $ c2  : num  0 0.132 0.143 0.137 0.135 0.223 0.054 0.206 0.271 0.03 ...
##  $ c3  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ c4  : num  0.778 0.372 0.276 0.137 0.135 0 0.164 0 0.181 0.244 ...
##  $ c5  : num  0 0.18 0.184 0 0 0 0.054 0 0.203 0.081 ...
##  $ c6  : num  0 0.048 0.01 0 0 0 0 0 0.022 0 ...
##  $ ca  : num  3.76 5.11 9.82 3.54 3.54 ...
##  $ cl  : int  61 101 485 40 40 15 4 11 445 43 ...
##  $ ct  : int  278 1028 2259 191 191 54 112 49 1257 749 ...
##  $ Spam: int  1 1 1 1 1 1 1 1 1 1 ...
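
The 57 predictors are the word-frequency columns w01-w48 and the character-frequency columns c1-c6 (percentages of words and characters in the e-mail), plus three capital-run-length statistics: ca (average), cl (longest) and ct (total). Spam is the 0/1 class label.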

order<-sample(4601,3800)                  # random 3800/801 train/test split (unseeded, so results vary between runs)
spambased$Spam<-as.factor(spambased$Spam) # treat the 0/1 label as a factor
spam_train<-spambased[order,]
spam_test<-spambased[-order,]
Spam_test_label<-spam_test[,58]           # column 58 is the Spam label
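
An optional sketch of a reproducible split, in case the numbers below should be repeatable (the seed value is arbitrary):

set.seed(1234)                            # arbitrary seed, purely for reproducibility
order<-sample(nrow(spambased),3800)
prop.table(table(spambased$Spam[order]))  # check the class balance of the training set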

#C50
pt<-proc.time()
Spam_c50<-C5.0(x=spam_train[,-58],y=spam_train$Spam)  # predictors in columns 1-57, label in column 58
proc.time()-pt

##    user  system elapsed
##   0.516   0.007   0.524

Spamc50_predict_results<-predict(Spam_c50,newdata=spam_test)
table(Spamc50_predict_results,Spam_test_label)

##                        Spam_test_label
## Spamc50_predict_results   0   1
##                       0 457  19
##                       1  31 294

error_c50<-sum(Spam_test_label!=Spamc50_predict_results)/nrow(spam_test)
print(paste0("Accuary(Precision) of C5.0: ",1-error_c50))

## [1] "Accuary(Precision) of C5.0: 0.937578027465668"

#svm
pt<-proc.time()
Spam_svm<-svm(Spam~.,data=spam_train,kernel="radial")  # C-classification with a radial (RBF) kernel
proc.time()-pt

##    user  system elapsed
##   1.720   0.027   1.751

Spamsvm_predict_results<-predict(Spam_svm,newdata=spam_test,type="class")
table(Spamsvm_predict_results,Spam_test_label)

##                        Spam_test_label
## Spamsvm_predict_results   0   1
##                       0 466  35
##                       1  22 278

error_svm<-sum(Spam_test_label!=Spamsvm_predict_results)/nrow(spam_test)
print(paste0("Accuary(Precision) of SVM: ",1-error_svm))

## [1] "Accuary(Precision) of SVM: 0.928838951310861"

#knn
Spam_train_x<-spam_train[,-58]
Spam_test_x<-spam_test[,-58]
Spam_train_label<-spam_train[,58]
pt<-proc.time()# check time consumed
Spam_knn<-knn(Spam_train_x,Spam_test_x,Spam_train_label,k=1)
proc.time()-pt

##    user  system elapsed
##   0.092   0.004   0.098

table(Spam_knn,Spam_test_label)

##         Spam_test_label
## Spam_knn   0   1
##        0 401  61
##        1  87 252

error_knn<-sum(Spam_test_label!=Spam_knn)/nrow(spam_test)
print(paste0("Accuary(Precision) of KNN: ",1-error_knn))

## [1] "Accuary(Precision) of KNN: 0.815230961298377"

#Rpart(classification tree)
pt<-proc.time()
Spam_rpart<-rpart(Spam~.,data=spam_train,method="class")
proc.time()-pt

##    user  system elapsed
##   0.461   0.006   0.468

Spamrpart_predict_results<-predict(Spam_rpart,newdata=spam_test,type="class")
table(Spamrpart_predict_results,Spam_test_label)

##                          Spam_test_label
## Spamrpart_predict_results   0   1
##                         0 461  57
##                         1  27 256

error_rpart<-sum(Spam_test_label!=Spamrpart_predict_results)/nrow(spam_test)
print(paste0("Accuary(Precision) of rpart: ",1-error_rpart))

## [1] "Accuary(Precision) of rpart: 0.895131086142322"

#Naive Bayes
pt <- proc.time()
Spam_nbs <- naiveBayes(Spam ~ ., data = spam_train)
proc.time() - pt

##    user  system elapsed
##   0.024   0.002   0.025

Spamnbs_predict_results<-predict(Spam_nbs,newdata=spam_test,type="class")
table(Spamnbs_predict_results,Spam_test_label)

##                        Spam_test_label
## Spamnbs_predict_results   0   1
##                       0 269  11
##                       1 219 302

error_nbs<-sum(Spam_test_label!=Spamnbs_predict_results)/nrow(spam_test)
print(paste0("Accuary(Precision) of Naive Bayes: ",1-error_nbs))

## [1] "Accuary(Precision) of Naive Bayes: 0.712858926342072"

#Neural Network
library(nnet)
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(pROC)

## Type 'citation("pROC")' for a citation.

##
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
##
##     cov, smooth, var

# caret's classProbs = TRUE and twoClassSummary need syntactically valid factor labels,
# so relabel the 0/1 response as X0/X1 via make.names()
levels <- unique(spam_train[,58])
spam_train[,58] <- factor(spam_train[,58], labels = make.names(levels))
pt <- proc.time()
numFolds <- trainControl(method = 'cv', number = 10, classProbs = TRUE,
                         verboseIter = TRUE, summaryFunction = twoClassSummary,
                         preProcOptions = list(thresh = 0.75, ICAcomp = 3, k = 5))
Spam_nn <- train(x = spam_train[,-58], y = spam_train[,58], method = "nnet",
                 preProcess = c('center', 'scale'), trControl = numFolds,
                 tuneGrid = expand.grid(size = c(10), decay = c(0.1)))

## Warning in train.default(x = spam_train[, -58], y = spam_train[, 58],
## method = "nnet", : The metric "Accuracy" was not in the result set. ROC
## will be used instead.

## + Fold01: size=10, decay=0.1
## # weights:  591
## initial  value 2965.373442
## iter  10 value 591.820514
## iter  20 value 443.063415
## iter  30 value 388.494456
## iter  40 value 350.636843
## iter  50 value 321.464267
## iter  60 value 306.122219
## iter  70 value 281.704723
## iter  80 value 269.522967
## iter  90 value 256.984699
## iter 100 value 248.846096
## final  value 248.846096
## stopped after 100 iterations
## - Fold01: size=10, decay=0.1
## + Fold02: size=10, decay=0.1
## # weights:  591
## initial  value 2498.941644
## iter  10 value 750.146355
## iter  20 value 561.424123
## iter  30 value 458.262698
## iter  40 value 397.966672
## iter  50 value 352.925491
## iter  60 value 323.061674
## iter  70 value 303.280244
## iter  80 value 288.742827
## iter  90 value 276.339896
## iter 100 value 268.805654
## final  value 268.805654
## stopped after 100 iterations
## - Fold02: size=10, decay=0.1
## + Fold03: size=10, decay=0.1
## # weights:  591
## initial  value 3201.051637
## iter  10 value 674.014821
## iter  20 value 505.342951
## iter  30 value 410.698562
## iter  40 value 351.328818
## iter  50 value 318.688547
## iter  60 value 294.321328
## iter  70 value 274.351943
## iter  80 value 265.034292
## iter  90 value 259.585714
## iter 100 value 255.273016
## final  value 255.273016
## stopped after 100 iterations
## - Fold03: size=10, decay=0.1
## + Fold04: size=10, decay=0.1
## # weights:  591
## initial  value 2462.868947
## iter  10 value 717.415855
## iter  20 value 493.795771
## iter  30 value 412.068577
## iter  40 value 371.529178
## iter  50 value 336.614681
## iter  60 value 305.292466
## iter  70 value 287.302471
## iter  80 value 274.419110
## iter  90 value 267.877469
## iter 100 value 263.419744
## final  value 263.419744
## stopped after 100 iterations
## - Fold04: size=10, decay=0.1
## + Fold05: size=10, decay=0.1
## # weights:  591
## initial  value 2949.190641
## iter  10 value 681.113442
## iter  20 value 490.416399
## iter  30 value 418.097669
## iter  40 value 361.806455
## iter  50 value 322.763168
## iter  60 value 297.779493
## iter  70 value 282.783662
## iter  80 value 273.268818
## iter  90 value 266.772008
## iter 100 value 262.067830
## final  value 262.067830
## stopped after 100 iterations
## - Fold05: size=10, decay=0.1
## + Fold06: size=10, decay=0.1
## # weights:  591
## initial  value 2335.791972
## iter  10 value 616.200011
## iter  20 value 457.880457
## iter  30 value 394.251750
## iter  40 value 341.546870
## iter  50 value 310.695766
## iter  60 value 289.495196
## iter  70 value 272.827716
## iter  80 value 260.965995
## iter  90 value 253.446501
## iter 100 value 247.608625
## final  value 247.608625
## stopped after 100 iterations
## - Fold06: size=10, decay=0.1
## + Fold07: size=10, decay=0.1
## # weights:  591
## initial  value 2187.012474
## iter  10 value 602.928859
## iter  20 value 493.641818
## iter  30 value 426.477344
## iter  40 value 373.564396
## iter  50 value 338.773198
## iter  60 value 310.132787
## iter  70 value 290.709443
## iter  80 value 279.423736
## iter  90 value 270.723099
## iter 100 value 263.532781
## final  value 263.532781
## stopped after 100 iterations
## - Fold07: size=10, decay=0.1
## + Fold08: size=10, decay=0.1
## # weights:  591
## initial  value 2756.337927
## iter  10 value 598.949962
## iter  20 value 460.854701
## iter  30 value 405.594546
## iter  40 value 346.182812
## iter  50 value 308.546178
## iter  60 value 284.595848
## iter  70 value 270.773372
## iter  80 value 263.086711
## iter  90 value 258.178841
## iter 100 value 254.316005
## final  value 254.316005
## stopped after 100 iterations
## - Fold08: size=10, decay=0.1
## + Fold09: size=10, decay=0.1
## # weights:  591
## initial  value 2661.814209
## iter  10 value 747.100895
## iter  20 value 590.390424
## iter  30 value 492.309891
## iter  40 value 425.077636
## iter  50 value 376.424054
## iter  60 value 346.554210
## iter  70 value 330.933599
## iter  80 value 318.177736
## iter  90 value 307.654434
## iter 100 value 300.157972
## final  value 300.157972
## stopped after 100 iterations
## - Fold09: size=10, decay=0.1
## + Fold10: size=10, decay=0.1
## # weights:  591
## initial  value 2199.353448
## iter  10 value 663.618264
## iter  20 value 523.426028
## iter  30 value 461.922356
## iter  40 value 404.069524
## iter  50 value 366.172260
## iter  60 value 323.969924
## iter  70 value 295.581604
## iter  80 value 282.873027
## iter  90 value 274.971334
## iter 100 value 269.809346
## final  value 269.809346
## stopped after 100 iterations
## - Fold10: size=10, decay=0.1
## Aggregating results
## Fitting final model on full training set
## # weights:  591
## initial  value 2799.084262
## iter  10 value 770.280089
## iter  20 value 565.938243
## iter  30 value 466.824209
## iter  40 value 427.723052
## iter  50 value 376.176212
## iter  60 value 355.492191
## iter  70 value 328.556080
## iter  80 value 313.928708
## iter  90 value 301.390170
## iter 100 value 289.590595
## final  value 289.590595
## stopped after 100 iterations

proc.time() - pt

##    user  system elapsed
##  24.965   0.410  26.656

Spamnn_predict_results<-predict(Spam_nn,newdata=spam_test)
Spamnn_predict_results<-as.factor(as.numeric(Spamnn_predict_results)-1)  # map the X0/X1 factor back to 0/1 to match Spam_test_label
table(Spamnn_predict_results,Spam_test_label)

##                       Spam_test_label
## Spamnn_predict_results   0   1
##                      0 462  23
##                      1  26 290

error_nn<-sum(Spam_test_label!=Spamnn_predict_results)/nrow(spam_test)
print(paste0("Accuary(Precision) of Neural Network: ",1-error_nn))

## [1] "Accuary(Precision) of Neural Network: 0.938826466916355"

From the final accuracy results we can see that the neural network and C5.0 perform almost equally well, with the radial SVM close behind, while naive Bayes clearly lags. Note that the neural network was tuned with 10-fold cross-validation to get a good result, which also accounts for its much longer running time.
