Denoise with Auto Encoder of H2O in Python for MNIST

We previously discussed auto-encoders here and here, using R (https://charleshsliao.wordpress.com/2017/04/14/identify-arguments-of-h2o-deep-learning-model-with-tuned-auto-encoder-in-r-with-mnist/).

We also talked about the three functions of auto encoder above.
This is a pretty standard example used for benchmarking anomaly detection models.

We use Python 3 and the H2O framework to build an auto-encoder. More details can be found in Sebastian Raschka’s book: https://www.goodreads.com/book/show/25545994-python-machine-learning?ac=1&from_search=true

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
import numpy as np
rcParams['figure.figsize'] = 20, 12

import h2o
# Start H2O on the local machine
h2o.init()

# Import the MNIST training and test sets from H2O's public test-data bucket
train_with_label = h2o.import_file(
    "http://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/mnist/train.csv.gz"
)
test_with_label = h2o.import_file(
    "http://h2o-public-test-data.s3.amazonaws.com/bigdata/laptop/mnist/test.csv.gz"
)

# Confirm the dimensions of the training frame that was just imported.
print(train_with_label.shape)
### (60000, 785)

# Column positions 0..783 are the model inputs; the remaining column (C785)
# is left out here and is used later in the script to filter rows by digit.
predictors = list(range(784))
train, test = train_with_label[predictors], test_with_label[predictors]

from h2o.estimators.deeplearning import H2OAutoEncoderEstimator

# Auto-encoder configured with hidden=[20], Tanh activation and 100 epochs;
# constant columns are kept (ignore_const_cols=False).
model = H2OAutoEncoderEstimator(
    hidden=[20],
    activation="Tanh",
    epochs=100,
    ignore_const_cols=False,
)
# Train on the predictor columns of the training frame.
model.train(training_frame=train, x=predictors)
###deeplearning Model Build progress: [######################################] 100%

# Bare expression: looks up the 'output' section of the trained model's JSON.
# The value is neither assigned nor printed, so it is only useful when the
# line is evaluated in an interactive session.
model._model_json['output']
# Reconstruction error of the test frame under the trained auto-encoder; the
# summary printed by describe() (pasted below) shows one 'Reconstruction.MSE'
# column with 10000 rows.
test_rec_error = model.anomaly(test)
test_rec_error.describe()
###
#Rows:10000
#Cols:1
#
#
         #Reconstruction.MSE
#-------  --------------------
#type     real
#mins     0.005928091441463744
#mean     0.024239994005425435
#maxs     1.5853027245900233
#sigma    0.018023693408600293
#zeros    0
#missing  0
#0        0.02259901540014912
#1        0.020429460694658674
#2        0.04361697763998575
#3        0.022013397380102426
#4        0.009571660434791622
#5        0.030431115990108004
#6        0.025375366981211355
#7        0.012204461492973908
#8        0.021209901035720125
#9        0.030681865713517453

# Bring the reconstruction errors into pandas and expose the row position as
# an explicit 'id' column so it can serve as the x axis of the scatter plot.
test_rec_error_df = test_rec_error.as_data_frame()
test_rec_error_df['id'] = test_rec_error_df.index
test_rec_error_df.plot.scatter(x='id', y='Reconstruction.MSE')
plt.show()

Screen Shot 2017-06-25 at 8.46.51 PM.png

# Attach each test row's reconstruction error to the labelled test frame.
test_with_error = test_with_label.cbind(test_rec_error)
# Keep the rows whose error exceeds 1.0 and take the first of them
# ([0, :] selects row 0 with all of its columns).
outlier = test_with_error[test_with_error['Reconstruction.MSE'] > 1.0][0, :]
# Run that row's predictor columns through the auto-encoder and append the
# row's reconstruction error as an extra column of the result.
outlier_recon = model.predict(outlier[predictors]).cbind(outlier['Reconstruction.MSE'])
###deeplearning prediction progress: [#######################################] 100%
print(outlier_recon)
###reconstr_C1    reconstr_C2    reconstr_C3    reconstr_C4    
###reconstr_C5    reconstr_C6    reconstr_C7    reconstr_C8    reconstr_C9 
###SKIPPED

# Histogram of the reconstruction errors, restricted to the range [0.0, 0.07].
test_rec_error.as_data_frame().hist(bins=1000, range=[0.0, 0.07])
# Display the figure right away, as every other plot in this script does;
# without this call it stays pending until a later plt.show().
plt.show()
### We can extract good digits for future plots:
# rows whose C785 value is 3 and whose reconstruction error is below 0.02.
digits_of_3 = test_with_error[(test_with_error['C785'] == 3) & (test_with_error['Reconstruction.MSE'] < 0.02)]
# Row index 1 (the second matching row) is kept as the example digit.
good_digit_of_3 = digits_of_3[1, :]

###We define the function that adds noise, used for denoising
def add_noise(data, p):
    """Return ``data`` with its predictor columns randomly zeroed.

    Every cell of the predictor columns is multiplied by an independent 0/1
    draw that equals 1 with probability ``1.0 - p``, so a cell is set to
    zero with probability ``p``.

    Args:
        data: Frame holding at least the columns listed in the module-level
            ``predictors``; it is multiplied by an ``h2o.H2OFrame`` mask.
        p: Probability of zeroing any single predictor cell.

    Returns:
        The masked predictor columns. When ``data`` has more columns than
        there are predictors, ``data[len(predictors):]`` is appended
        unchanged via ``cbind``.
    """
    feature_count = len(predictors)
    # One Bernoulli(1 - p) draw per cell: 1 keeps the value, 0 wipes it.
    keep_flags = np.random.binomial(1, 1.0 - p, (data.shape[0], feature_count))
    corrupted = data[predictors] * h2o.H2OFrame(keep_flags.tolist())
    if data.shape[1] <= feature_count:
        return corrupted
    # Re-attach the trailing columns untouched by the mask.
    return corrupted.cbind(data[feature_count:])

###We denoise the data with H2OAutoEncoderEstimator
# Second auto-encoder: hidden=[20] with Tanh activation, plus L1
# regularisation (l1=1e-5) and a single epoch, trained on corrupted inputs.
denoised_model = H2OAutoEncoderEstimator(
    activation="Tanh",
    hidden=[20],
    l1=1e-5,
    ignore_const_cols=False,
    epochs=1,
)

# 0.3 is the probability with which add_noise zeroes each predictor cell.
denoised_model.train(x=predictors, training_frame=add_noise(train, 0.3))
# The reconstruction error is measured on the test frame without added noise.
denoised_test_rec_error = denoised_model.anomaly(test)
denoised_test_rec_error_df = denoised_test_rec_error.as_data_frame()

# Row position as an explicit column, used as the x axis of the scatter plot.
denoised_test_rec_error_df['id'] = denoised_test_rec_error_df.index
denoised_test_rec_error_df.plot(
    kind='scatter',
    x='id',
    y='Reconstruction.MSE',
    title='Reconstruction error of denoised test digits',
)
plt.show()
denoised_test_rec_error.describe()
# Histogram the error column only: the frame now also carries the helper 'id'
# column, which DataFrame.hist would otherwise draw as a second panel.
denoised_test_rec_error_df.hist(
    column='Reconstruction.MSE', bins=1000, range=[0.0, 0.15]
)
plt.show()

Screen Shot 2017-06-25 at 8.52.26 PM.png

# Rows whose reconstruction error exceeds 0.10.
# NOTE(review): test_with_error holds the errors of the first model (`model`),
# not of denoised_model - confirm that this is the intended source.
test_ugly = test_with_error[test_with_error['Reconstruction.MSE'] > 0.10]
# Count those rows per value of C785 and draw the counts as a bar chart.
# NOTE(review): the count column is read as 'nrow_C1'; this name may depend on
# the H2O version - confirm against the installed release.
(
    test_ugly.group_by('C785')
    .count()
    .frame.as_data_frame()
    .plot.bar(x='C785', y='nrow_C1', title='ugly')
)
plt.show()
###We can also plot histograms of the good and bad digits

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s