DAY 13 - Mar 9, 2017



In [1]:

    
import pandas as pd
import matplotlib.pyplot as plt



In [2]:

    
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

Dataset

https://www.kaggle.com/ludobenistant/hr-analytics



In [3]:

    
# Peek at the raw CSV with the shell's `head` before parsing it with pandas
!head HR_comma_sep.csv



In [4]:

    
# Parse the HR dataset into a DataFrame
data = pd.read_csv(filepath_or_buffer="HR_comma_sep.csv")

# Print the (rows, columns) shape, then preview the first five records
print(data.shape)
data.head(n=5)









    



(14999, 10)






    Out[4]:






  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
      sales
      salary
    
  
  
    
      0
      0.38
      0.53
      2
      157
      3
      0
      1
      0
      sales
      low
    
    
      1
      0.80
      0.86
      5
      262
      6
      0
      1
      0
      sales
      medium
    
    
      2
      0.11
      0.88
      7
      272
      4
      0
      1
      0
      sales
      medium
    
    
      3
      0.72
      0.87
      5
      223
      5
      0
      1
      0
      sales
      low
    
    
      4
      0.37
      0.52
      2
      159
      3
      0
      1
      0
      sales
      low



In [5]:

    
# Which distinct values does the salary column take?
{level for level in data["salary"]}









    Out[5]:





{'high', 'low', 'medium'}



In [6]:

    
# Draw a random subset of 200 rows to cluster; random_state fixes the draw
X = data.sample(
    random_state=1,
    n=200,
)
X.head(n=5)









    Out[6]:






  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
      sales
      salary
    
  
  
    
      11908
      0.67
      0.39
      2
      207
      7
      0
      0
      0
      sales
      medium
    
    
      10508
      0.51
      0.98
      4
      245
      3
      0
      0
      0
      support
      medium
    
    
      9193
      0.66
      0.54
      2
      136
      2
      0
      0
      0
      technical
      low
    
    
      13087
      0.53
      0.81
      3
      275
      2
      0
      0
      0
      technical
      low
    
    
      506
      0.83
      0.98
      4
      259
      5
      0
      1
      0
      support
      medium



In [7]:

    
# Keep only the numeric columns for clustering: drop the two text columns
# ("sales" and "salary") by name rather than by position, so the selection
# stays correct even if the CSV's column order changes.
features = X.drop(columns=["sales", "salary"])
features.head()









    Out[7]:






  
    
      
      satisfaction_level
      last_evaluation
      number_project
      average_montly_hours
      time_spend_company
      Work_accident
      left
      promotion_last_5years
    
  
  
    
      11908
      0.67
      0.39
      2
      207
      7
      0
      0
      0
    
    
      10508
      0.51
      0.98
      4
      245
      3
      0
      0
      0
    
    
      9193
      0.66
      0.54
      2
      136
      2
      0
      0
      0
    
    
      13087
      0.53
      0.81
      3
      275
      2
      0
      0
      0
    
    
      506
      0.83
      0.98
      4
      259
      5
      0
      1
      0

t-SNE



In [8]:

    
from sklearn.manifold import TSNE

# Project the sampled feature rows onto two dimensions with t-SNE.
# random_state is fixed so the embedding can be reproduced.
# NOTE(review): `features` is passed unscaled, so wide-range columns such as
# average_montly_hours probably dominate the distances — consider
# standardizing the columns first; confirm before changing results.
tsne = TSNE(n_components=2, random_state=0, perplexity=7).fit_transform(features)

# Explicit Axes interface; the trailing `;` keeps the Text repr out of the output.
fig, ax = plt.subplots()
ax.scatter(tsne[:, 0], tsne[:, 1])
ax.set_xlabel("tsne_1")
ax.set_ylabel("tsne_2")
ax.set_title("t-SNE embedding of the sampled HR records");









    Out[8]:





<matplotlib.text.Text at 0x10cbedb38>

K-means clustering

http://scikit-learn.org/stable/modules/clustering.html



In [9]:

    
from sklearn.cluster import KMeans



In [10]:

    
# Partition the 2-D t-SNE coordinates into three k-means clusters.
# n_init is set explicitly because scikit-learn's default changed across
# releases (10 -> "auto"); pinning it keeps the labels stable between
# versions and avoids the FutureWarning emitted during the transition.
y_pred = KMeans(n_clusters=3, n_init=10, random_state=1).fit_predict(tsne)

# Colour each embedded point by its assigned cluster label.
# The trailing `;` keeps the Text repr out of the cell output.
fig, ax = plt.subplots()
ax.scatter(tsne[:, 0], tsne[:, 1], c=y_pred)
ax.set_xlabel("tsne_1")
ax.set_ylabel("tsne_2")
ax.set_title("K-means clusters (k=3) on the t-SNE embedding");









    Out[10]:





<matplotlib.text.Text at 0x1108b60b8>



In [ ]:

# Scratch notes (kept as comments so this cell does not break Run All):
# the data.head() and X.head() previews with the Work_accident and
# promotion_last_5years columns removed.
#
# 	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	left	sales	salary
# 0	0.38	0.53	2	157	3	1	sales	low
# 1	0.80	0.86	5	262	6	1	sales	medium
# 2	0.11	0.88	7	272	4	1	sales	medium
# 3	0.72	0.87	5	223	5	1	sales	low
# 4	0.37	0.52	2	159	3	1	sales	low
#
# 	satisfaction_level	last_evaluation	number_project	average_montly_hours	time_spend_company	left	sales	salary
# 11908	0.67	0.39	2	207	7	0	sales	medium
# 10508	0.51	0.98	4	245	3	0	support	medium
# 9193	0.66	0.54	2	136	2	0	technical	low
# 13087	0.53	0.81	3	275	2	0	technical	low
# 506	0.83	0.98	4	259	5	1	support	medium