Preprocess: t-SNE in Python

Posted on May 29, 2017 by charleshsliao

We use the digits dataset that ships with the scikit-learn library, and the editor is Sublime Text 3. Most of the code comes from the book "Introduction to Machine Learning with Python": https://www.goodreads.com/book/show/32439431-introduction-to-machine-learning-with-python?from_search=true

###There is a class of visualization algorithms called manifold learning algorithms,
###which allow for much more complex mappings and often provide better visualizations than PCA.
###A particularly useful one is the t-SNE algorithm.

from sklearn.datasets import load_digits
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
# One hex colour per class label; the plots below index this list by the
# sample's target value to pick the text colour.
colors = [
    "#476A2A",
    "#7851B8",
    "#BD3430",
    "#4A2D4E",
    "#875525",
    "#A83683",
    "#4E655E",
    "#853541",
    "#3A3120",
    "#535D8E",
]
###1. Plot the digits from sklearn
import pandas as pd
# Load the digits dataset and preview its first ten images on a 2x5 grid.
digits = load_digits()

# Empty tuples for xticks/yticks suppress the tick marks on every subplot.
tickless = {'xticks': (), 'yticks': ()}
fig, axes = plt.subplots(2, 5, figsize=(10, 5), subplot_kw=tickless)

# zip() stops at the shorter input, so only as many images as there are
# axes (ten) get drawn.
for axis, image in zip(axes.ravel(), digits.images):
    axis.imshow(image)
plt.show()

###2. PCA
# Fit a two-component PCA on the digit feature matrix, then project the same
# matrix onto those two components.
digits01 = digits.data
pca = PCA(n_components=2).fit(digits01)  # fit() hands back the fitted estimator
digits_pca = pca.transform(digits01)
def rstr(df):
    """Return a quick structural summary of a DataFrame.

    Args:
        df: the pandas DataFrame to summarise.

    Returns:
        A 2-tuple ``(df.shape, uniques)`` where ``uniques`` is whatever
        ``df.apply`` builds when each column is mapped to a one-element
        list holding that column's ``unique()`` values.
        NOTE(review): the exact container pandas returns for list-valued
        results may vary between pandas versions -- confirm before relying
        on it.
    """
    per_column_uniques = df.apply(lambda column: [column.unique()])
    return df.shape, per_column_uniques
# Print the shape and per-column unique values of the digit feature matrix
# under a heading.
digits_frame = pd.DataFrame(digits.data)
print('\nstructure of data:\n', rstr(digits_frame))

# Plot the PCA projection, drawing every sample as its class label.
pca_x, pca_y = digits_pca[:, 0], digits_pca[:, 1]

plt.figure(figsize=(10, 10))
# plt.text() does not autoscale the axes, so fix the limits to the data range.
plt.xlim(pca_x.min(), pca_x.max())
plt.ylim(pca_y.min(), pca_y.max())

# Write each digit as coloured text at its projected position instead of
# using scatter markers.
label_font = {'weight': 'bold', 'size': 6}
for (x_pos, y_pos), label in zip(digits_pca, digits.target):
    plt.text(x_pos, y_pos, str(label), color=colors[label], fontdict=label_font)

plt.xlabel("first principal component")
plt.ylabel("second principal component")
plt.show()


###3. Plot with t-SNE
from sklearn.manifold import TSNE
# Embed the digit feature matrix with t-SNE and plot the result, drawing
# every sample as its class label.
digits02 = digits.data
tsne = TSNE(perplexity=50, random_state=42)
# TSNE has no transform method, so fitting and embedding happen in a single
# fit_transform call.
digits_tsne = tsne.fit_transform(digits02)

tsne_x, tsne_y = digits_tsne[:, 0], digits_tsne[:, 1]

plt.figure(figsize=(10, 10))
# The upper limits are padded by 1 beyond the data maximum.
plt.xlim(tsne_x.min(), tsne_x.max() + 1)
plt.ylim(tsne_y.min(), tsne_y.max() + 1)

# Write each digit as coloured text at its embedded position instead of
# using scatter markers.
label_font = {'weight': 'bold', 'size': 9}
for (x_pos, y_pos), label in zip(digits_tsne, digits.target):
    plt.text(x_pos, y_pos, str(label), color=colors[label], fontdict=label_font)
plt.show()

You can read more about how to interpret t-SNE plots here: http://distill.pub/2016/misread-tsne/

Published by charleshsliao

View all posts by charleshsliao

Leave a comment Cancel reply