Preprocess: t-SNE in Python

We use the handwritten-digits dataset that ships with the scikit-learn library, and the editor is Sublime Text 3. Most of the code comes from the book "Introduction to Machine Learning with Python": https://www.goodreads.com/book/show/32439431-introduction-to-machine-learning-with-python?from_search=true

###There is a class of visualization algorithms called manifold learning algorithms,
###which allow for much more complex mappings and often provide better visualizations than PCA.
###A particularly useful one is the t-SNE algorithm.

from sklearn.datasets import load_digits
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
# Ten hex color strings; the text plots index this list with each sample's
# target label to pick the label color.
colors = [
    "#476A2A",
    "#7851B8",
    "#BD3430",
    "#4A2D4E",
    "#875525",
    "#A83683",
    "#4E655E",
    "#853541",
    "#3A3120",
    "#535D8E",
]
###1. Plot the digits from sklearn
import pandas as pd

digits = load_digits()
# A 2x5 grid of axes with the tick marks switched off.
fig, axes = plt.subplots(
    2, 5,
    figsize=(10, 5),
    subplot_kw={'xticks': (), 'yticks': ()},
)
# zip stops at the shorter input, so only as many images as there are axes
# get drawn.
for panel, picture in zip(axes.flat, digits.images):
    panel.imshow(picture)
plt.show()

figure_1.png

###2. PCA
# Fit a two-component PCA on the digit feature matrix, then project the same
# matrix onto those two components.
digits01 = digits.data
pca = PCA(n_components=2).fit(digits01)
digits_pca = pca.transform(digits01)
def rstr(df):
    """Return ``df.shape`` paired with the per-column unique values of ``df``.

    The second element is whatever ``df.apply`` produces when every column is
    mapped to a one-element list holding that column's ``unique()`` array.
    """
    def _boxed_uniques(column):
        # Each column's distinct values, wrapped in a single-element list.
        return [column.unique()]

    dimensions = df.shape
    distinct_values = df.apply(_boxed_uniques)
    return dimensions, distinct_values
# Show the shape and the per-column unique values of the raw digit features.
data_structure = rstr(pd.DataFrame(digits.data))
print('\nstructure of data:\n', data_structure)

# Draw the PCA projection with every sample rendered as its digit label.
first_pc = digits_pca[:, 0]
second_pc = digits_pca[:, 1]
plt.figure(figsize=(10, 10))
# Axis limits span exactly the range of the projected coordinates.
plt.xlim(first_pc.min(), first_pc.max())
plt.ylim(second_pc.min(), second_pc.max())
label_font = {'weight': 'bold', 'size': 6}
for point, label in zip(digits_pca, digits.target):
    # Text markers (rather than scatter dots) show which digit each point is.
    plt.text(point[0], point[1], str(label),
             color=colors[label], fontdict=label_font)
plt.xlabel("first principal component")
plt.ylabel("second principal component")
plt.show()

Screen Shot 2017-05-29 at 1.35.35 PM.png


###3. Plot with t-SNE
from sklearn.manifold import TSNE

digits02 = digits.data
tsne = TSNE(perplexity=50, random_state=42)
# TSNE offers no separate transform step, so fitting and embedding happen in
# a single fit_transform call.
digits_tsne = tsne.fit_transform(digits02)
# Draw the t-SNE embedding with every sample rendered as its digit label.
tsne_x = digits_tsne[:, 0]
tsne_y = digits_tsne[:, 1]
plt.figure(figsize=(10, 10))
# The upper limits get one extra unit of room beyond the largest coordinate.
plt.xlim(tsne_x.min(), tsne_x.max() + 1)
plt.ylim(tsne_y.min(), tsne_y.max() + 1)
label_font = {'weight': 'bold', 'size': 9}
for point, label in zip(digits_tsne, digits.target):
    # Text markers (rather than scatter dots) show which digit each point is.
    plt.text(point[0], point[1], str(label),
             color=colors[label], fontdict=label_font)
plt.show()

Screen Shot 2017-05-29 at 2.21.25 PM.png
You can read more about how to interpret t-SNE plots here: http://distill.pub/2016/misread-tsne/

Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s