Bank Loan Estimation with SVM and Logistic Regression

Use the bank marketing dataset from UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/datasets/Bank+Marketing).

 


# Load the UCI bank-marketing sample (semicolon-separated, with a header row).
# stringsAsFactors = TRUE is spelled out because R >= 4.0 reads text columns as
# character by default, while the later preprocessing tests for factor columns.
bank_d <- read.csv("bank.csv", sep = ";", header = TRUE,
                   stringsAsFactors = TRUE)
# Print the column types to confirm the import.
str(bank_d)

## 'data.frame':    4521 obs. of  17 variables:
##  $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
##  $ job      : Factor w/ 12 levels "admin.","blue-collar",..: 11 8 5 5 2 5 7 10 3 8 ...
##  $ marital  : Factor w/ 3 levels "divorced","married",..: 2 2 3 2 2 3 2 2 2 2 ...
##  $ education: Factor w/ 4 levels "primary","secondary",..: 1 2 3 3 2 3 3 2 3 1 ...
##  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
##  $ housing  : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
##  $ loan     : Factor w/ 2 levels "no","yes": 1 2 1 2 1 1 1 1 1 2 ...
##  $ contact  : Factor w/ 3 levels "cellular","telephone",..: 1 1 1 3 3 1 1 1 3 1 ...
##  $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
##  $ month    : Factor w/ 12 levels "apr","aug","dec",..: 11 9 1 7 9 4 9 9 9 1 ...
##  $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
##  $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
##  $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
##  $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
##  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 1 1 4 4 1 2 4 4 1 ...
##  $ y        : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

# Recode every categorical column as the integer code of its (alphabetically
# sorted) level. Character columns are converted to factors first so the step
# also works when the file was read without stringsAsFactors.
for (i in seq_along(bank_d)) {
  column <- bank_d[[i]]
  if (is.character(column)) {
    column <- factor(column)
  }
  if (is.factor(column)) {
    bank_d[[i]] <- as.numeric(column)
  }
}
# The level codes of y are 1 ("no") and 2 ("yes"); shift them to 0/1.
bank_d$y <- bank_d$y - 1
# Standardise the wide-range numeric columns (positions 1, 6, 12 and 14).
to_scale <- c("age", "balance", "duration", "pdays")
bank_d[, to_scale] <- scale(bank_d[, to_scale])
# Seed the RNG before drawing the split so the partition is reproducible, and
# size the 70% training sample from the data instead of hard-coded row counts.
set.seed(2017)
order <- sample(nrow(bank_d), floor(0.7 * nrow(bank_d)))
bank_d$y <- as.factor(bank_d$y)
bank_train <- bank_d[order, ]
bank_test <- bank_d[-order, ]
# True labels of the held-out rows, used by every evaluation step.
bank_test_result <- bank_test$y
##############Q1################
set.seed(2017)
# Logistic regression of y on every other column of the training split.
bank_lg <- glm(y ~ ., family = binomial, data = bank_train)
# type = "response" makes predict() return probabilities, not class labels.
bank_lg_predict <- predict(bank_lg, type = "response", newdata = bank_test)
# Turn the probabilities into labels with a 0.5 cut-off: start from "1" for
# every prediction (sized from the predictions, not a hard-coded row count),
# then mark everything below the cut-off as "0".
bank_lg_predict_result <- rep("1", length(bank_lg_predict))
bank_lg_predict_result[bank_lg_predict < 0.5] <- "0"
# Confusion matrix: rows are predicted labels, columns are true labels.
table(bank_lg_predict_result, bank_test_result)

##                       bank_test_result
## bank_lg_predict_result    0    1
##                      0 1152  147
##                      1   27   31

# Misclassification rate: mismatching test rows divided by the number of
# test labels.
error_rate_lg <- sum(bank_lg_predict_result != bank_test_result) /
  length(bank_test_result)
library(ROCR)

## Loading required package: gplots

##
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
##
##     lowess

# ROC curve built from the predicted probabilities and the true labels.
pred_lg <- ROCR::prediction(predictions = bank_lg_predict,
                            labels = bank_test_result)
perf_lg <- performance(pred_lg, measure = "tpr", x.measure = "fpr")
plot(perf_lg)

# The AUC is the first entry of the y.values slot of the "auc" performance.
Auc_lg <- slot(performance(pred_lg, measure = "auc"), "y.values")[[1]]
print(paste0("The Area Under the Curve is ", Auc_lg))

## [1] "The Area Under the Curve is 0.83440070141331"

# The value printed here is the overall accuracy (1 - error rate).
print(paste0("The Accuracy(Precision) is ", 1 - error_rate_lg))

## [1] "The Accuracy(Precision) is 0.87177597641857"

#############Q2##############
library(e1071)
#linear kernel
# The response is written as plain `y` so that `.` expands to the remaining
# columns only. With `bank_train$y ~ .` the left-hand side is not matched to
# the `y` column, so `y` stays among the predictors (target leakage).
# The former method="class" / type="class" arguments are not parameters of
# e1071's svm() or its predict method and have been dropped.
bank_svm <- svm(y ~ ., data = bank_train, kernel = "linear")
bank_svm_predict_result <- predict(bank_svm, newdata = bank_test)
# Confusion matrix: rows are predicted classes, columns are true classes.
svm_linear_confusion <- table(bank_svm_predict_result, bank_test_result)
svm_linear_confusion

# NOTE: the transcript lines in this section were recorded with the original
# model call and may differ when the script is re-run.
##                        bank_test_result
## bank_svm_predict_result    0    1
##                       0 1179  178
##                       1    0    0

# Share of test rows whose predicted class differs from the true class.
svm_linear_error <- sum(bank_test_result != bank_svm_predict_result) /
  nrow(bank_test)
# The value printed here is the overall accuracy (1 - error rate).
print(paste0("Accuracy(Precision) of SVM with Linear: ", 1 - svm_linear_error))

## [1] "Accuracy(Precision) of SVM with Linear: 0.868828297715549"

# Recall: correctly predicted "1" rows divided by all rows that are truly "1".
print(paste0("Recall of SVM with Linear: ",
             svm_linear_confusion[2, 2] / sum(svm_linear_confusion[, 2])))

## [1] "Recall of SVM with Linear: 0"

#rbf
# Radial-kernel SVM with default cost and gamma. The response is written as
# plain `y`: with `bank_train$y ~ .` the `.` expansion keeps the `y` column
# among the predictors. The unused method="class" / type="class" arguments
# have been dropped.
bank_svm_rbf <- svm(y ~ ., data = bank_train, kernel = "radial")
bank_svm_rbf_predict_result <- predict(bank_svm_rbf, newdata = bank_test)
# Confusion matrix: rows are predicted classes, columns are true classes.
svm_rbf_confusion <- table(bank_svm_rbf_predict_result, bank_test_result)
svm_rbf_confusion

# NOTE: the transcript lines in this section were recorded with the original
# model call and may differ when the script is re-run.
##                            bank_test_result
## bank_svm_rbf_predict_result    0    1
##                           0 1161  156
##                           1   18   22

# Share of test rows whose predicted class differs from the true class.
svm_rbf_error <- sum(bank_test_result != bank_svm_rbf_predict_result) /
  nrow(bank_test)
# The value printed here is the overall accuracy (1 - error rate).
print(paste0("Accuracy(Precision) of SVM with rbf: ", 1 - svm_rbf_error))

## [1] "Accuracy(Precision) of SVM with rbf: 0.87177597641857"

# Recall: correctly predicted "1" rows divided by all rows that are truly "1".
print(paste0("Recall of SVM with rbf: ",
             svm_rbf_confusion[2, 2] / sum(svm_rbf_confusion[, 2])))

## [1] "Recall of SVM with rbf: 0.123595505617978"

####tune
# Compare two cost settings for the linear kernel: 10^0 and 10^(-2).
# The response is written as plain `y`: with `bank_train$y ~ .` the `.`
# expansion keeps the `y` column among the predictors. The unused
# method="class" / type="class" arguments have been dropped.
bank_svm_l1 <- svm(y ~ ., data = bank_train, kernel = "linear", cost = 10^0)
bank_svm_predict_result_l1 <- predict(bank_svm_l1, newdata = bank_test)
svm_linear_error_l1 <- sum(bank_test_result != bank_svm_predict_result_l1) /
  nrow(bank_test)

bank_svm_l2 <- svm(y ~ ., data = bank_train, kernel = "linear", cost = 10^(-2))
bank_svm_predict_result_l2 <- predict(bank_svm_l2, newdata = bank_test)
svm_linear_error_l2 <- sum(bank_test_result != bank_svm_predict_result_l2) /
  nrow(bank_test)

# The values printed below are overall accuracies (1 - error rate).
print(paste0("Accuracy(Precision) of SVM with Linear and Cost of 1: ",
             1 - svm_linear_error_l1))

# NOTE: the transcript lines in this section were recorded with the original
# model calls and may differ when the script is re-run.
## [1] "Accuracy(Precision) of SVM with Linear and Cost of 1: 0.868828297715549"

# NOTE(review): "Cost of 2" labels the second model, whose cost is 10^(-2).
print(paste0("Accuracy(Precision) of SVM with Linear and Cost of 2: ",
             1 - svm_linear_error_l2))

## [1] "Accuracy(Precision) of SVM with Linear and Cost of 2: 0.868828297715549"

#The C parameter tells the SVM optimization how much you
#want to avoid misclassifying each training example. For large values
#of C, the optimization will choose a smaller-margin hyperplane if that
#hyperplane does a better job of getting all the training points classified
#correctly. Conversely, a very small value of C will cause the optimizer to
#look for a larger-margin separating hyperplane, even if that hyperplane
#misclassifies more points.

#rbf
# Compare two gamma settings for the radial kernel at a fixed cost of 10.
# The response is written as plain `y`: with `bank_train$y ~ .` the `.`
# expansion keeps the `y` column among the predictors. The unused
# method="class" / type="class" arguments have been dropped.
bank_svm_rbf_01 <- svm(y ~ ., data = bank_train, kernel = "radial",
                       cost = 10, gamma = 1)
bank_svm_rbf_predict_result_01 <- predict(bank_svm_rbf_01, newdata = bank_test)

bank_svm_rbf_02 <- svm(y ~ ., data = bank_train, kernel = "radial",
                       cost = 10, gamma = 0.1)
bank_svm_rbf_predict_result_02 <- predict(bank_svm_rbf_02, newdata = bank_test)

# Share of test rows misclassified by each model.
svm_rbf_error_01 <- sum(bank_test_result != bank_svm_rbf_predict_result_01) /
  nrow(bank_test)
svm_rbf_error_02 <- sum(bank_test_result != bank_svm_rbf_predict_result_02) /
  nrow(bank_test)
# The values printed below are overall accuracies (1 - error rate).
print(paste0("Accuracy(Precision) of SVM with rbf 01: ", 1 - svm_rbf_error_01))

# NOTE: the transcript lines in this section were recorded with the original
# model calls and may differ when the script is re-run.
## [1] "Accuracy(Precision) of SVM with rbf 01: 0.867354458364038"

print(paste0("Accuracy(Precision) of SVM with rbf 02: ", 1 - svm_rbf_error_02))

## [1] "Accuracy(Precision) of SVM with rbf 02: 0.862932940309506"

There is no single best C or gamma value for an SVM; the best choice depends on the data and on the problem being solved.

From the observations above, the higher gamma value resulted in slightly better accuracy for the RBF kernel, while changing the cost did not affect the results of the linear kernel.

 

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s