An SMS Spam Test with Naive Bayes in R, with Text Processing

SMS (Short Message Service) traffic often includes fraudulent messages from God-knows-where. With Naive Bayes we can build a classifier that predicts whether a message is spam or not, based on NLP (natural language processing).

data: http://www.dt.fee.unicamp.br/~tiago/smsspamcollection


# 1. Collect data
# The CSV is read without a header row, so the columns arrive as V1 and V2.
rawtext <- read.csv("HamorSpam.csv", header = FALSE, sep = ",",
                    stringsAsFactors = FALSE)
str(rawtext)
library(dplyr)
# V1 becomes the label column, V2 the message text.
rawtext <- rename(rawtext, type = V1, msg = V2)
class(rawtext$type)
rawtext$type <- as.factor(rawtext$type)
str(rawtext$type) # with ham=1, spam=2
table(rawtext$type) # count
# 2.1 Corpus with tm
library(tm)
msgcorpus <- VCorpus(VectorSource(rawtext$msg))
# "UTF-8-MAC" is an encoding name that only macOS provides; fall back to
# plain "UTF-8" elsewhere so the script also runs on Linux and Windows.
utf8_target <- if (identical(unname(Sys.info()["sysname"]), "Darwin")) {
  "UTF-8-MAC"
} else {
  "UTF-8"
}
# Re-encode every message; bytes that cannot be converted are kept as
# <xx> escapes (sub = "byte") instead of turning the message into NA.
# mc.cores = 1 is kept for tm versions that parallelise tm_map(); the
# transformer accepts `...` so that tm versions which forward extra
# arguments to the transformer do not stop with "unused argument".
msgcorpus <- tm_map(
  msgcorpus,
  content_transformer(function(x, ...) iconv(x, to = utf8_target, sub = "byte")),
  mc.cores = 1
)

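The above step is EXTREMELY important for Mac users: without the re-encoding, later text-processing steps can fail on messages containing invalid characters.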


msgcorpus # print a summary of the corpus
class(msgcorpus) # actually a list underneath, though a complex one
inspect(msgcorpus[1:2]) # inspect the first two messages
as.character(msgcorpus[[1]]) # view the text of the 1st message
lapply(msgcorpus[1:3], as.character) # view the text of the first three
library(SnowballC) # loaded ahead of the stemming step below
# 2.12 lower-case the messages
# tolower is a plain character function, so it must be wrapped with
# content_transformer() to keep each element a tm document.
msgclean <- tm_map(msgcorpus, content_transformer(tolower))
# 2.13 remove numbers
msgclean <- tm_map(msgclean, removeNumbers)
# 2.14 remove stop words and punctuation
msgclean <- tm_map(msgclean, removeWords, stopwords())
msgclean <- tm_map(msgclean, removePunctuation)
# 2.15 stem and collapse the whitespace left behind by the removals
msgclean <- tm_map(msgclean, stemDocument)
msgclean <- tm_map(msgclean, stripWhitespace)
as.character(msgclean[[1]]) # check the result
# 2.16 tokenise into a document-term matrix (one row per message)
msgdtm <- DocumentTermMatrix(msgclean)
msgdtm # check words and rows
# 3. Build training and test datasets
# Rows 1-4572 are used for training; rows 4573-5572 are held out for testing.
msgtrain <- msgdtm[seq_len(4572), ]
msgtest <- msgdtm[4573:5572, ]
# Matching label vectors, taken from the same row ranges of the raw data.
trainglb <- rawtext$type[seq_len(4572)]
testlb <- rawtext$type[4573:5572]
# Check that both subsets show a similar ham/spam proportion.
prop.table(table(trainglb))
prop.table(table(testlb))
# Word cloud for the cleaned messages (terms occurring at least 50 times,
# most frequent terms placed first).
library(wordcloud)
wordcloud(msgclean, min.freq = 50, random.order = FALSE)

screen-shot-2017-03-02-at-8-37-03-pm


# Build a re-encoded tm corpus from a character vector of messages.
#   messages: character vector, one SMS per element.
#   returns:  a VCorpus whose documents have been passed through iconv().
# "UTF-8-MAC" only exists on macOS, so plain "UTF-8" is used elsewhere.
# The transformer accepts `...` so that tm versions which forward
# mc.cores to it do not stop with "unused argument".
as_utf8_corpus <- function(messages) {
  target <- if (identical(unname(Sys.info()["sysname"]), "Darwin")) {
    "UTF-8-MAC"
  } else {
    "UTF-8"
  }
  tm_map(
    VCorpus(VectorSource(messages)),
    content_transformer(function(x, ...) iconv(x, to = target, sub = "byte")),
    mc.cores = 1
  )
}

# word cloud for ham messages
# The text column is selected by name so the result is always a character
# vector, even if the data frame has more than two columns.
hammsg <- as_utf8_corpus(subset(rawtext, type == "ham")$msg)
wordcloud(hammsg, max.words = 90, scale = c(3, 0.5))
# word cloud for spam messages
spammsg <- as_utf8_corpus(subset(rawtext, type == "spam")$msg)
wordcloud(spammsg, max.words = 90, scale = c(3, 0.5))

HAM|SPAM

# 4. Create indicator features for frequent words
# Drop rare terms: keep only those that findFreqTerms() reports with a
# frequency of at least 5 in the training matrix (roughly 0.1 percent of
# the training records).
freqwords <- findFreqTerms(msgtrain, lowfreq = 5)
str(freqwords)
# Restrict both matrices to the same retained vocabulary.
freqtrain <- msgtrain[, freqwords]
freqtest <- msgtest[, freqwords]
# The Naive Bayes classifier is typically trained on data with
# categorical features. This poses a problem, since the cells
# in the sparse matrix are numeric and measure the number of times
# a word appears in a message. We need to change this to a categorical
# variable that simply indicates yes or no depending on whether
# the word appears at all.
#
# appear_counts: turn term counts into presence indicators.
#   x:       numeric vector or matrix of counts.
#   returns: character values of the same shape as x, "Yes" where the
#            count is positive and "No" otherwise (NA stays NA).
appear_counts <- function(x) {
  # Return the value directly: ending on an assignment would hand the
  # result back invisibly and leave a pointless local variable.
  ifelse(x > 0, "Yes", "No")
}
# Convert every term column from counts to "Yes"/"No" indicators.
msgtrainc <- apply(freqtrain, 2, appear_counts)
msgtestc <- apply(freqtest, 2, appear_counts)
# 5. Train model
library(e1071)
NBsclassifier <- naiveBayes(msgtrainc, trainglb)
# 6. Evaluate and tune
# Time the prediction step together with the confusion table.
pt <- proc.time()
NBspr <- predict(NBsclassifier, msgtestc)
table(NBspr, testlb)
proc.time() - pt
# Retrain with a Laplace estimator.
# Without it, a word that appeared in zero spam or zero ham training
# messages has an indisputable say in the classification. Just because
# the word "ringtone" only appeared in the spam messages in the training
# data, it does not mean that every message with this word should be
# classified as spam; laplace = 1 smooths the counts so that no
# class-conditional probability is exactly zero.
NBsclassifier2 <- naiveBayes(msgtrainc, trainglb, laplace = 1)
pt <- proc.time()
NBspr2 <- predict(NBsclassifier2, msgtestc)
table(NBspr2, testlb)
# Output recorded by the author:
##     testlb
#NBspr2 ham spam
#   ham 862 16
#   spam 5 117
proc.time() - pt
#user system elapsed
#27.841 0.259 29.167
Advertisements

1 thought on “A SMS Spam Test with Naive Bayes in R, with Text Processing”

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s