Interpretation Kmeans/DBSCAN, with SNS example and NAs

This dataset was compiled by Brett Lantz while conducting sociological research on the teenage identities at the University of Notre Dame.

Data source: Machine.Learning.with.R.2nd.Edition, Chapter 9.

Dbscan Defines density as the number of points within a specified radius, There is no way to exclude outlier samples Given a specified number of neighboring samples (MinPts) within the radius, every sample point can be tagged as either

1. Core point.

2. Border point.

3. Noise point: all other points

DBscan algorithm would

• Form a separate cluster for each core point or a connected group of core points • Assign each boarder point to the cluster of its corresponding core point • So a DBSCAN cluster is not necessarily a spherical shape (as in K-means) because of connected group of core points • Noise points are discarded


teens<-read.csv("snsdata.csv")
str(teens)

## 'data.frame':    30000 obs. of  40 variables:
##  $ gradyear    : int  2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
##  $ gender      : Factor w/ 2 levels "F","M": 2 1 2 1 NA 1 1 2 1 1 ...
##  $ age         : num  19 18.8 18.3 18.9 19 ...
##  $ friends     : int  7 0 69 0 10 142 72 17 52 39 ...
##  $ basketball  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ football    : int  0 1 1 0 0 0 0 0 0 0 ...
##  $ soccer      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ softball    : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ volleyball  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ swimming    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cheerleading: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ baseball    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ tennis      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ sports      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ cute        : int  0 1 0 1 0 0 0 0 0 1 ...
##  $ sex         : int  0 0 0 0 1 1 0 2 0 0 ...
##  $ sexy        : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ hot         : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ kissed      : int  0 0 0 0 5 0 0 0 0 0 ...
##  $ dance       : int  1 0 0 0 1 0 0 0 0 0 ...
##  $ band        : int  0 0 2 0 1 0 1 0 0 0 ...
##  $ marching    : int  0 0 0 0 0 1 1 0 0 0 ...
##  $ music       : int  0 2 1 0 3 2 0 1 0 1 ...
##  $ rock        : int  0 2 0 1 0 0 0 1 0 1 ...
##  $ god         : int  0 1 0 0 1 0 0 0 0 6 ...
##  $ church      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ jesus       : int  0 0 0 0 0 0 0 0 0 2 ...
##  $ bible       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hair        : int  0 6 0 0 1 0 0 0 0 1 ...
##  $ dress       : int  0 4 0 0 0 1 0 0 0 0 ...
##  $ blonde      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mall        : int  0 1 0 0 0 0 2 0 0 0 ...
##  $ shopping    : int  0 0 0 0 2 1 0 0 0 1 ...
##  $ clothes     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hollister   : int  0 0 0 0 0 0 2 0 0 0 ...
##  $ abercrombie : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ die         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ death       : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ drunk       : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ drugs       : int  0 0 0 0 1 0 0 0 0 0 ...

# identify NAs
sum(is.na(teens))

## [1] 7810

summary(teens$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
##   3.086  16.310  17.290  17.990  18.260 106.900    5086

#observe 3 and 106 year old
teens$age <- ifelse(teens$age >= 13 & teens$age < 20,
teens$age, NA)
summary(teens$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's
##   13.03   16.30   17.26   17.25   18.22   20.00    5523

#An easy solution for handling the missing values is to exclude
#any record with a missing value
table(teens$gender, useNA = "ifany")

##
##     F     M  <NA>
## 22054  5222  2724

#the aggregate() function is the tool for the job. It computes
#statistics for subgroups of data. Here, it calculates the mean
#age by graduation year after removing the NA values
#  ave() function, which returns a vector with the group means
# repeated so that the result is equal in length to the original vector:
aggregate(data = teens, age ~ gradyear, mean, na.rm = TRUE)

##   gradyear      age
## 1     2006 18.65586
## 2     2007 17.70617
## 3     2008 16.76770
## 4     2009 15.81957

ave_age <- ave(teens$age, teens$gradyear, FUN =
function(x) mean(x, na.rm = TRUE))
# replace NA with mean
teens$age <- ifelse(is.na(teens$age), ave_age, teens$age)
teens$gender<-ifelse(teens$gender == "F",0,1)
summary(teens$age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##   13.03   16.28   17.24   17.24   18.21   20.00

#model the 36 features that represent the interests
interests<-teens[,5:40]
interests_sdf<-as.data.frame(lapply(interests,scale))

# user kmeans
set.seed(123)
teen_clusters<-kmeans(interests_sdf,5)
#check each group's size
teen_clusters$size

## [1]  2583  4477 21263   990   687

#check the coordinates of the cluster centroids
#the numbers across each row indicate the cluster's average
#value for the interest listed at the top of the column.
#As the values are z-score standardized, positive values are
#above the overall mean level for all the teens and negative
#values are below the overall mean.
teen_clusters$centers

##     basketball    football       soccer    softball  volleyball
## 1  1.366079417  1.17529316  0.476361860  1.20618908  1.11332011
## 2 -0.006342877  0.07044371  0.070298133 -0.06211143 -0.03378958
## 3 -0.187154596 -0.18680193 -0.079356506 -0.14162901 -0.13481522
## 4  0.366956761  0.38410687  0.139266026  0.12755805  0.09991111
## 5  0.168842670  0.35013806  0.006283266  0.06937640  0.06293187
##      swimming cheerleading    baseball      tennis      sports       cute
## 1  0.08300583  -0.08312542  1.11304300  0.11564208  1.12464282 -0.0250709
## 2  0.31984029  -0.07834280 -0.06498358  0.12271196 -0.05672321  0.7425203
## 3 -0.09285629  -0.15200307 -0.13712560 -0.04384667 -0.16074457 -0.1843897
## 4  0.27537789   0.07744270  0.26159478  0.11438259  0.78356954  0.4901231
## 5  0.08071351   5.41605043  0.10577029 -0.04222980 -0.01284514  0.2561155
##             sex         sexy           hot      kissed       dance
## 1 -0.0402756042 -0.004192134  0.0008016923 -0.09392197 -0.01172372
## 2  0.0030847772  0.259154797  0.5431886244 -0.01229063  0.61606628
## 3 -0.0945818898 -0.082923098 -0.1366202488 -0.13354891 -0.15462573
## 4  2.1224136296  0.562625023  0.3070830791  3.14496638  0.43189169
## 5  0.0001842334  0.082659861  0.2431118756  0.03458439  0.19270407
##          band    marching       music        rock         god     church
## 1 -0.05808894 -0.05915918  0.05137051  0.12117303  0.02050123  0.1104702
## 2  0.31178917  0.23670165  0.36842838  0.16079934  0.39956811  0.5492738
## 3 -0.07841503 -0.04460347 -0.13874129 -0.10961423 -0.10842311 -0.1407903
## 4  0.46442486  0.08727753  1.20220515  1.26491434  0.41687142  0.1687260
## 5 -0.05571803 -0.06536590 -0.03241782  0.06633699  0.07405633  0.1195675
##          jesus        bible         hair       dress      blonde
## 1  0.009671393 -0.015331112 -0.005461765 -0.07720504  0.03038766
## 2  0.322264538  0.287591989  0.362556940  0.60493309  0.02645834
## 3 -0.072995552 -0.062419592 -0.200722203 -0.14498022 -0.02885309
## 4  0.102543501  0.073445701  2.610594596  0.51050013  0.37030392
## 5 -0.024997356  0.009559632  0.108308415  0.09964249  0.07271788
##          mall    shopping      clothes   hollister abercrombie         die
## 1 -0.01100531  0.02576077  0.003624177 -0.06415495 -0.06531511 -0.07843801
## 2  0.71776455  0.93294127  0.602627610  0.63973379  0.59918903  0.09219892
## 3 -0.18665726 -0.22878033 -0.188617303 -0.15441295 -0.14955061 -0.09070794
## 4  0.62747827  0.26986108  1.220278810  0.34322167  0.42038245  1.75771036
## 5  0.23680450  0.51539377  0.138527854  0.35679356  0.36369087 -0.03141111
##          death       drunk       drugs
## 1 -0.030192694 -0.07102059 -0.09086939
## 2  0.176206267  0.04486769 -0.04432093
## 3 -0.077193727 -0.08606936 -0.10943422
## 4  0.934049802  1.84219543  2.80112648
## 5  0.008403465 -0.01616748 -0.01903205

# from the results we can see group one is athletes type
# and group four is higher with drunk and death
# group five is cheerleader type and group three seems to be cold to
# everything, or more possibly they did not fill interests in profiles
# group two likes brain activities

# check group one's preference and size
gandi<-as.data.frame(teen_clusters$centers)
gandsize<-as.data.frame(teen_clusters$size)
kkd<-function(k){
for (i in c(1:36)) {
if (gandi[k,i]>0.3){as.vector(print(colnames(gandi)[[i]]))
}
}
}
c(t(gandsize)[[1]],kkd(1))

## [1] "basketball"
## [1] "football"
## [1] "soccer"
## [1] "softball"
## [1] "volleyball"
## [1] "baseball"
## [1] "sports"

## [1] 2583

#specify each teen's attribution
teens$cluster<-teen_clusters$cluster
teens[1:10,c("cluster","gender","age","friends")]

##    cluster gender      age friends
## 1        3      1 18.98200       7
## 2        2      0 18.80100       0
## 3        3      1 18.33500      69
## 4        3      0 18.87500       0
## 5        4     NA 18.99500      10
## 6        3      0 18.65586     142
## 7        2      0 18.93000      72
## 8        3      1 18.32200      17
## 9        3      0 19.05500      52
## 10       2      0 18.70800      39

# or each cluter's attribution: max age, gender(F=0) and friends
aggregate(data = teens, age ~ cluster, mean)

##   cluster      age
## 1       1 17.03618
## 2       2 17.10679
## 3       3 17.30350
## 4       4 17.11253
## 5       5 16.97600

aggregate(data = teens, gender ~ cluster, mean, na.action = na.omit)

##   cluster      gender
## 1       1 0.274397713
## 2       2 0.061229371
## 3       3 0.217138365
## 4       4 0.156316916
## 5       5 0.007911392

aggregate(data = teens, friends ~ cluster, mean)

##   cluster  friends
## 1       1 35.25474
## 2       2 37.50681
## 3       3 27.68664
## 4       4 31.07576
## 5       5 39.20961

# Use dbscan
library(dbscan)
teen_clusters2<-dbscan(interests_sdf,eps = 2.1,minPts = 200)
teen_clusters2

## DBSCAN clustering for 30000 objects.
## Parameters: eps = 2.1, minPts = 200
## The clustering contains 4 cluster(s) and 16124 noise points.
##
##     0     1     2     3     4
## 16124 12493   448   329   606
##
## Available fields: cluster, eps, minPts

teens$cluster2<-teen_clusters2$cluster
aggregate(data = teens, age ~ cluster2, mean)

##   cluster2      age
## 1        0 17.16231
## 2        1 17.32968
## 3        2 17.35213
## 4        3 17.33778
## 5        4 17.18983

aggregate(data = teens, gender ~ cluster2, mean, na.action = na.omit)

##   cluster2    gender
## 1        0 0.1507641
## 2        1 0.2379213
## 3        2 0.2203390
## 4        3 0.1423841
## 5        4 0.3807829

aggregate(data = teens, friends ~ cluster2, mean)

##   cluster2  friends
## 1        0 34.32678
## 2        1 25.39598
## 3        2 27.13170
## 4        3 25.35866
## 5        4 23.31518

 

 

 

 

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s