t-distributed Stochastic Neighbor Embedding.
t-SNE [1] is a tool for visualizing high-dimensional data, which makes it well suited to inspecting datasets.
It converts affinities between data points into probabilities. Reference: http://scikit-learn.org/stable/modules/manifold.html#t-sne
https://distill.pub/2016/misread-tsne/
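Make sure all features are on the same scale: t-SNE works from distances between points, so a feature with a much larger range would otherwise dominate the embedding.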
In [10]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler,Normalizer, RobustScaler
from sklearn.pipeline import make_pipeline
In [2]:
# Load the Iris dataset that ships with scikit-learn.
iris = datasets.load_iris()
# X holds the feature matrix; y holds the class labels used to colour the plots.
X, y = iris.data, iris.target
A tunable parameter of t-SNE is the “perplexity,” which says (loosely) how to balance attention between local and global aspects of your data. The parameter is, in a sense, a guess about the number of close neighbors each point has.
In [51]:
# Embed the raw, unscaled features with t-SNE.
# n_components is not passed, so the default of 2 components is used.
model = TSNE(
    learning_rate=50,
    init='pca',
    perplexity=45,
)
transformed = model.fit_transform(X)

# Plot the first embedding axis against the second, coloured by class label.
plt.scatter(x=transformed[:, 0], y=transformed[:, 1], c=y)
plt.show()
In [53]:
# Pre-process with Normalizer and embed with t-SNE in a single pipeline.
# NOTE(review): Normalizer rescales individual samples (rows), not features --
# confirm this is the intended preprocessing for this experiment.
normalizer = Normalizer()
model = TSNE(
    learning_rate=100,
    init='pca',
    perplexity=30,
)
pipeline = make_pipeline(normalizer, model)

# fit_transform runs the normalizer first, then t-SNE on its output.
transformed = pipeline.fit_transform(X)

# Plot the first embedding axis against the second, coloured by class label.
plt.scatter(x=transformed[:, 0], y=transformed[:, 1], c=y)
plt.show()
In [41]:
# Pre-process with StandardScaler and embed with t-SNE in a single pipeline.
# perplexity is not passed here, so the library default applies.
scaler = StandardScaler()
model = TSNE(
    learning_rate=50,
    init='pca',
)
pipeline = make_pipeline(scaler, model)

# fit_transform runs the scaler first, then t-SNE on its output.
transformed = pipeline.fit_transform(X)

# Plot the first embedding axis against the second, coloured by class label.
plt.scatter(x=transformed[:, 0], y=transformed[:, 1], c=y)
plt.show()