DAY 13 - Mar 9, 2017


In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Render matplotlib figures inline, directly below the cell that draws them.
%matplotlib inline

In [3]:
!head HR_comma_sep.csv












In [4]:
# Load the HR dataset into a DataFrame, report its (rows, columns) shape and
# preview the first five records.
csv_path = "HR_comma_sep.csv"
data = pd.read_csv(csv_path)

n_rows, n_cols = data.shape
print((n_rows, n_cols))
data.head(n=5)


(14999, 10)
Out[4]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years sales salary
0 0.38 0.53 2 157 3 0 1 0 sales low
1 0.80 0.86 5 262 6 0 1 0 sales medium
2 0.11 0.88 7 272 4 0 1 0 sales medium
3 0.72 0.87 5 223 5 0 1 0 sales low
4 0.37 0.52 2 159 3 0 1 0 sales low

In [5]:
# Which distinct values does the `salary` column take?
salary_levels = set(data.loc[:, "salary"])
salary_levels


Out[5]:
{'high', 'low', 'medium'}

In [6]:
# Draw a seeded (repeatable) random subset of rows to cluster, then preview it.
n_samples = 200
X = data.sample(n=n_samples, random_state=1)
X.head(n=5)


Out[6]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years sales salary
11908 0.67 0.39 2 207 7 0 0 0 sales medium
10508 0.51 0.98 4 245 3 0 0 0 support medium
9193 0.66 0.54 2 136 2 0 0 0 technical low
13087 0.53 0.81 3 275 2 0 0 0 technical low
506 0.83 0.98 4 259 5 0 1 0 support medium

In [7]:
# Keep only the numeric columns for the embedding: `sales` and `salary` hold
# strings, so they are excluded. They are dropped by name rather than by
# position so the selection does not silently depend on column order.
# NOTE(review): `left` stays in the feature set — presumably the outcome flag;
# confirm it is meant to influence the embedding.
features = X.drop(["sales", "salary"], axis=1)
features.head()


Out[7]:
satisfaction_level last_evaluation number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years
11908 0.67 0.39 2 207 7 0 0 0
10508 0.51 0.98 4 245 3 0 0 0
9193 0.66 0.54 2 136 2 0 0 0
13087 0.53 0.81 3 275 2 0 0 0
506 0.83 0.98 4 259 5 0 1 0

TSNE


In [8]:
from sklearn.manifold import TSNE

# Project the selected features down to two dimensions with t-SNE (seeded for
# repeatability), then scatter-plot the resulting embedding.
# NOTE(review): the features go in unscaled, so wide-range columns such as
# average_montly_hours probably dominate the distances — consider standardising.
embedder = TSNE(n_components=2, perplexity=7, random_state=0)
tsne = embedder.fit_transform(features)

fig, ax = plt.subplots()
ax.scatter(tsne[:, 0], tsne[:, 1])
ax.set_xlabel("tsne_1")
ax.set_ylabel("tsne_2")


Out[8]:
<matplotlib.text.Text at 0x10cbedb38>

In [9]:
from sklearn.cluster import KMeans

In [10]:
# Cluster the 2-D t-SNE embedding into three groups and colour each point by
# its assigned cluster label.
# n_init is pinned to 10 (the long-standing default) because scikit-learn 1.4
# switched the default to "auto", which runs a single k-means++ initialisation
# and can produce different labels for the same random_state.
# NOTE(review): k-means is fitted on the t-SNE coordinates rather than on the
# original features; t-SNE is not designed to preserve global distances, so
# treat these clusters as a visual aid — confirm this is the intent.
y_pred = KMeans(n_clusters=3, n_init=10, random_state=1).fit_predict(tsne)

plt.scatter(tsne[:,0], tsne[:,1], c=y_pred)
plt.xlabel("tsne_1")
plt.ylabel("tsne_2")


Out[10]:
<matplotlib.text.Text at 0x1108b60b8>

In [ ]: