DAY 13 - Mar 9, 2017
In [1]:
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
# IPython magic: render matplotlib figures inline, directly below the cell that draws them.
%matplotlib inline
In [3]:
# Shell escape: print the first lines of the raw CSV to eyeball its header and delimiter.
!head HR_comma_sep.csv
In [4]:
# Load the HR CSV into a DataFrame, report its (rows, columns) shape,
# and preview the first five records.
data = pd.read_csv(filepath_or_buffer="HR_comma_sep.csv")
print(data.shape)
data.head(n=5)
Out[4]:
In [5]:
# Which distinct values occur in the "salary" column?
{level for level in data["salary"]}
Out[5]:
In [6]:
# Draw a random subset of 200 rows to cluster; the fixed random_state
# makes the same rows come back on every run. Then preview the subset.
X = data.sample(random_state=1, n=200)
X.head(n=5)
Out[6]:
In [7]:
# Keep every column except the trailing two (selected by position, not by name).
# NOTE(review): presumably the two dropped columns are the non-numeric ones
# (e.g. "salary") — confirm against the column order of the CSV.
all_but_last_two = slice(None, -2)
features = X.iloc[:, all_but_last_two]
features.head(n=5)
Out[7]:
In [8]:
from sklearn.manifold import TSNE

# Embed the selected feature columns into two dimensions with t-SNE.
# random_state is fixed so the embedding is repeatable between runs.
embedder = TSNE(n_components=2, random_state=0, perplexity=7)
tsne = embedder.fit_transform(features)

# Scatter the two embedding coordinates against each other.
fig, ax = plt.subplots()
ax.scatter(tsne[:, 0], tsne[:, 1])
ax.set_xlabel("tsne_1")
ax.set_ylabel("tsne_2")
Out[8]:
In [9]:
from sklearn.cluster import KMeans
In [10]:
# Cluster the 2-D t-SNE coordinates into three groups with k-means.
# n_init is pinned explicitly: its default has changed between scikit-learn
# releases (10 restarts -> 'auto'), so leaving it implicit makes the labels
# depend on the installed version and triggers a FutureWarning on some
# releases. Ten restarts keeps the best of several initialisations.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=1)
y_pred = kmeans.fit_predict(tsne)

# Re-draw the embedding, colouring each point by its assigned cluster label.
fig, ax = plt.subplots()
ax.scatter(tsne[:, 0], tsne[:, 1], c=y_pred)
ax.set_xlabel("tsne_1")
ax.set_ylabel("tsne_2")
Out[10]:
In [ ]: