Getting Started


In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

pd.set_option('display.max_columns', 200)

In [2]:
# Data cleaning: standardization, missing data done on the cheap outside of the Python environment

In [3]:
survey_df_raw = pd.read_csv(filepath_or_buffer='chadev_survey_cleaned.csv', true_values=['Yes'], false_values=['No'], na_values=['-1'])

In [4]:
survey_df_raw.tail()


Out[4]:
Timestamp YEARS_EXP NUM_HOURS_WORK DES_NUM_HOURS_WORK DAYS_OFF SALARY COMP_SIZE HEALTH_INS HEALTH_INS_PCT STACK COMP_MEALS CONF_BUDGET PAID_LUNCH_HOUR PUBLIC_SPEAKING FREE_TESLA 401K TRAINING_BUDGET RIGHT_LEFT_BRAIN BOOKS_READ HIGH_SCHOOL DEGREE CAREER_DEGREE WORK_HIST_COUNT
62 12/12/2015 23:22:47 11.0 45 35 20 115000 250 True 80 .net; JavaScript; Linux; Clojure; Java; Csharp... 4 5000 True True True True True True 20 True True True 5
63 12/13/2015 15:11:48 12.0 45 45 15 95000 10 True 90 NaN 2 0 True True True True True True 50 True True True 6
64 12/13/2015 16:44:02 7.0 30 0 0 96000 1 False 0 Postgres; Node; Rails.js; Xcode; Android; Java... 0 2000 False True True False False True 16 True False False 4
65 12/15/2015 13:18:14 0.5 40 30 12 40000 1 True NaN NaN 0 0 True False True False False False 4 True True True 1
66 1/4/2016 13:18:12 2.0 40 40 10 30000 10 False 0 Angular; MySQL; Javascript 0 0 False False True False False True 3 True False False 2

In [5]:
# Binning: Categorize company size by number of employees

In [6]:
def companySizeCat(numEmployees):
    if numEmployees < 7: # micro
        return 1
    elif numEmployees < 250: # small
        return 2
    elif numEmployees < 500: # medium
        return 3
    elif numEmployees < 1000: # large
        return 4
    else: # enterprise
        return 5

In [7]:
survey_df_raw['COMP_SIZE'] = survey_df_raw['COMP_SIZE'].apply(lambda x: companySizeCat(x))

In [8]:
survey_df_raw.head()


Out[8]:
Timestamp YEARS_EXP NUM_HOURS_WORK DES_NUM_HOURS_WORK DAYS_OFF SALARY COMP_SIZE HEALTH_INS HEALTH_INS_PCT STACK COMP_MEALS CONF_BUDGET PAID_LUNCH_HOUR PUBLIC_SPEAKING FREE_TESLA 401K TRAINING_BUDGET RIGHT_LEFT_BRAIN BOOKS_READ HIGH_SCHOOL DEGREE CAREER_DEGREE WORK_HIST_COUNT
0 9/4/2015 23:08:05 5 35 32 20 70000 1 True 100 rails; angular; javascript; ruby 4 0 True False True False False True 0 True True True 8
1 9/4/2015 23:56:55 10 40 30 0 120000 1 False 0 linux; apache; mysql; php; lemp; javascript; f... 4 3000 True True True False True False 3 True False False 6
2 9/5/2015 0:34:15 11 45 40 15 75000 2 True 25 .NET; Puppet; Java; Windows; Debian; Node; Jav... 0 0 False False True False False False 2 True True True 6
3 9/5/2015 8:36:17 6 40 20 16 77000 2 True 50 Linux; PHP; Nginx; Postgresql 0 0 True False True False False False 10 True True True 3
4 9/5/2015 12:07:20 6 50 60 14 70000 2 True 80 Ruby; Rails; PHP; Magento; Nginx; Linux; OSX 0 0 False False True False False False 3 True False False 3

In [9]:
# Drop columns that don't give any particular insight (subjectively chosen)

In [10]:
survey_df_raw.drop(['Timestamp', 'DES_NUM_HOURS_WORK', 'HEALTH_INS_PCT', 'PAID_LUNCH_HOUR', 
                    'PUBLIC_SPEAKING', 'FREE_TESLA', 'RIGHT_LEFT_BRAIN', 'HIGH_SCHOOL'], axis=1, inplace=True);

In [11]:
survey_df_raw.columns


Out[11]:
Index([u'YEARS_EXP', u'NUM_HOURS_WORK', u'DAYS_OFF', u'SALARY', u'COMP_SIZE',
       u'HEALTH_INS', u'STACK', u'COMP_MEALS', u'CONF_BUDGET', u'401K',
       u'TRAINING_BUDGET', u'BOOKS_READ', u'DEGREE', u'CAREER_DEGREE',
       u'WORK_HIST_COUNT'],
      dtype='object')

In [12]:
# Let's try to understand the data

In [13]:
survey_df_raw.describe()


Out[13]:
YEARS_EXP NUM_HOURS_WORK DAYS_OFF SALARY COMP_SIZE HEALTH_INS COMP_MEALS CONF_BUDGET 401K TRAINING_BUDGET BOOKS_READ DEGREE CAREER_DEGREE WORK_HIST_COUNT
count 67.000000 67.000000 67.000000 67.000000 67.000000 67 67.000000 67.000000 67 67 67.000000 67 67 67.000000
mean 9.305970 43.044776 21.059701 82020.149254 2.492537 0.880597 1.343284 5302.238806 0.6268657 0.3731343 8.828358 0.7761194 0.6119403 5.328358
std 7.335032 6.883077 16.737619 28621.548435 1.172580 0.3267094 2.421821 27144.660774 0.4872875 0.4872875 10.460804 0.419989 0.4909861 4.247595
min 0.500000 20.000000 0.000000 20000.000000 1.000000 False 0.000000 0.000000 False False 0.000000 False False 1.000000
25% 4.000000 40.000000 14.000000 65000.000000 2.000000 1 0.000000 0.000000 0 0 2.500000 1 0 3.000000
50% 7.000000 40.000000 20.000000 77000.000000 2.000000 1 0.000000 0.000000 1 0 5.000000 1 1 4.000000
75% 12.000000 47.000000 23.500000 95000.000000 3.000000 1 2.000000 475.000000 1 1 10.000000 1 1 6.000000
max 30.000000 65.000000 100.000000 185000.000000 5.000000 True 10.000000 200000.000000 True True 50.000000 True True 27.000000

Now on to questions

Question 1: What is the most common company size, years of experience, number of jobs, number of books read, salary


In [14]:
sel_columns_df = survey_df_raw[['COMP_SIZE','YEARS_EXP', 'WORK_HIST_COUNT', 'BOOKS_READ', 'SALARY']]
sel_columns_df.mode()


Out[14]:
COMP_SIZE YEARS_EXP WORK_HIST_COUNT BOOKS_READ SALARY
0 2 10 3 3 75000
1 NaN NaN 4 NaN NaN

Question 2: What are the most common stack tools?

To answer this we need to first vectorize the free-text in the stack


In [15]:
# Define a function to tokenize the raw text entered by the users 
def tokenizr(text):
    tokens = text.split(";");
    return [ token.strip() for token in tokens if token.isspace() == False ]

In [16]:
# Vectorize the text using sklearn provided CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tools = survey_df_raw['STACK'];
tools.fillna('', inplace=True);

tf_vectorizer = CountVectorizer(tokenizer=tokenizr);
tools_matrix = tf_vectorizer.fit_transform(tools);

In [17]:
# Produces a matrix
tools_matrix.shape


Out[17]:
(67, 161)

In [18]:
tf_vectorizer.get_feature_names()


Out[18]:
[u'',
 u'.net',
 u'airflow',
 u'android',
 u'anger',
 u'angular',
 u'ansible',
 u'apache',
 u'apex',
 u'asp .net',
 u'asp mvc',
 u'atom',
 u'aws',
 u'baby bottles & burping cloths.',
 u'bash',
 u'bootstrap',
 u'brainpower',
 u'c++',
 u'centos',
 u'chairs',
 u'classic asp',
 u'clojure',
 u'cognos',
 u'computers',
 u'consul',
 u'csharp',
 u'css',
 u'css3',
 u'db2 for z/os',
 u'db2 luw',
 u'debian',
 u'digitalocean',
 u'divshot',
 u'django',
 u'docker',
 u'drupal',
 u'ec2',
 u'eclipse ide',
 u'elasticache',
 u'elasticsearch',
 u'elixir',
 u'ember',
 u'es6',
 u'excel',
 u'express',
 u'express.js',
 u'flask',
 u'flux',
 u'force.com platform',
 u'fsharp',
 u'giphy',
 u'git',
 u'github',
 u'go',
 u'google',
 u'google docs',
 u'google drive',
 u'google hangouts',
 u'gotomeeting',
 u'graphite',
 u'gulp',
 u'hand tools',
 u'hate',
 u'heroku',
 u'hipchat',
 u'hl7',
 u'html',
 u'html5',
 u'iis',
 u'ims',
 u'internet',
 u'java',
 u'javascript',
 u'jekyll',
 u'jenkins',
 u'jira',
 u'jquery',
 u'jruby',
 u'keras',
 u'kibana',
 u'knockout',
 u'knockout.js',
 u'kubernetes',
 u'lasers',
 u'lemp',
 u'line scan cameras',
 u'linux',
 u'magento',
 u'mercurial',
 u'mesos',
 u'microsoft',
 u'mongodb',
 u'mssql',
 u'mysql',
 u'netezza',
 u'nginx',
 u'node',
 u'obj c',
 u'oracle',
 u'osx',
 u'outlook',
 u'perl',
 u'phone',
 u'php',
 u'plc/mlc',
 u'postgres',
 u'postgresql',
 u'power tools',
 u'powershell',
 u'puppet',
 u'python',
 u'rabbitmq',
 u'rackspace',
 u'rails',
 u'rails.js',
 u'rds',
 u'react',
 u'redis',
 u'redshift',
 u'redux',
 u'rhel',
 u'ruby',
 u's3',
 u'sails',
 u'salesforce',
 u'sass',
 u'scikit-learn',
 u'scss',
 u'segment.io',
 u'selenium',
 u'server 2012',
 u'sinatra',
 u'skype',
 u'slack',
 u'sql',
 u'sql server',
 u'ssh',
 u'stack overflow',
 u'sublime text',
 u'surreal.cms',
 u'swift',
 u'teamcity',
 u'teradata',
 u'terminals',
 u'torquebox',
 u'ubuntu',
 u'vagrant',
 u'vb',
 u'vba',
 u'vero',
 u'vim',
 u'visual studio',
 u'visual studio 2012',
 u'vmware',
 u'windows',
 u'windows batch files',
 u'windows server',
 u'wordpress',
 u'wp',
 u'xcode',
 u'xslt']

In [19]:
# Turn the matrix into a dataframe
tools_matrix_df = DataFrame(data=tools_matrix.toarray(), columns=tf_vectorizer.get_feature_names())

# Join the dataframe to the survey data frame
survey_df_wTM = survey_df_raw.join(tools_matrix_df)
survey_df_wTM.drop(['STACK'], axis=1, inplace=True)

# visualize
survey_df_wTM.head()


Out[19]:
YEARS_EXP NUM_HOURS_WORK DAYS_OFF SALARY COMP_SIZE HEALTH_INS COMP_MEALS CONF_BUDGET 401K TRAINING_BUDGET BOOKS_READ DEGREE CAREER_DEGREE WORK_HIST_COUNT .net airflow android anger angular ansible apache apex asp .net asp mvc atom aws baby bottles & burping cloths. bash bootstrap brainpower c++ centos chairs classic asp clojure cognos computers consul csharp css css3 db2 for z/os db2 luw debian digitalocean divshot django docker drupal ec2 eclipse ide elasticache elasticsearch elixir ember es6 excel express express.js flask flux force.com platform fsharp giphy git github go google google docs google drive google hangouts gotomeeting graphite gulp hand tools hate heroku hipchat hl7 html html5 iis ims internet java javascript jekyll jenkins jira jquery jruby keras kibana knockout knockout.js kubernetes lasers lemp line scan cameras linux magento mercurial mesos microsoft mongodb mssql mysql netezza nginx node obj c oracle osx outlook perl phone php plc/mlc postgres postgresql power tools powershell puppet python rabbitmq rackspace rails rails.js rds react redis redshift redux rhel ruby s3 sails salesforce sass scikit-learn scss segment.io selenium server 2012 sinatra skype slack sql sql server ssh stack overflow sublime text surreal.cms swift teamcity teradata terminals torquebox ubuntu vagrant vb vba vero vim visual studio visual studio 2012 vmware windows windows batch files windows server wordpress wp xcode xslt
0 5 35 20 70000 1 True 4 0 False False 0 True True 8 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 10 40 0 120000 1 False 4 3000 False True 3 False False 6 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 11 45 15 75000 2 True 0 0 False False 2 True True 6 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
3 6 40 16 77000 2 True 0 0 False False 10 True True 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 6 50 14 70000 2 True 0 0 False False 3 False False 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [20]:
# Get the most mentioned stack tools by summing the values of the tools_matrix_df
# Results show we have a very webby crowd (or at the very least, lots of full-stack developers)
X = tools_matrix_df.sum()

X.sort(ascending=False)
X.head(10)


Out[20]:
javascript    39
linux         23
node          22
ruby          22
rails         19
postgres      19
nginx         16
php           15
mysql         14
angular       12
dtype: int64

Question 3: Are there commonalities in groups of responders that machine learning can sniff out on our behalf?

Cluster the survey responses together to see if there are common groups of responders. See what the characteristics of those groups are.


In [21]:
# Using a basic KMeans algorithm
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, max_iter=500)

# Attempt 1: Just give it all the data in its raw, naked glory and see what it comes up with
kmeans.fit(survey_df_wTM)
kmeans.labels_


Out[21]:
array([0, 1, 0, 0, 0, 3, 0, 3, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 3, 0, 2,
       4, 0, 0, 0, 0, 0, 3, 3, 0, 1, 0, 0, 0, 0, 3, 1, 3, 0, 0, 1, 0, 0, 0,
       3, 0, 0, 0, 1, 3, 3, 3, 3, 3, 3, 1, 1, 0, 3, 1, 1, 0, 0, 3, 3], dtype=int32)

In [22]:
# Taking a look at what records were assigned to what cluster, I notice a disturbing trend: Salary is outweighing 
#   all other features
survey_df_c1 = survey_df_wTM.copy()
survey_df_c1.insert(0, 'CLUSTER1_LABEL', kmeans.labels_)
survey_df_c1.head(10)


Out[22]:
CLUSTER1_LABEL YEARS_EXP NUM_HOURS_WORK DAYS_OFF SALARY COMP_SIZE HEALTH_INS COMP_MEALS CONF_BUDGET 401K TRAINING_BUDGET BOOKS_READ DEGREE CAREER_DEGREE WORK_HIST_COUNT .net airflow android anger angular ansible apache apex asp .net asp mvc atom aws baby bottles & burping cloths. bash bootstrap brainpower c++ centos chairs classic asp clojure cognos computers consul csharp css css3 db2 for z/os db2 luw debian digitalocean divshot django docker drupal ec2 eclipse ide elasticache elasticsearch elixir ember es6 excel express express.js flask flux force.com platform fsharp giphy git github go google google docs google drive google hangouts gotomeeting graphite gulp hand tools hate heroku hipchat hl7 html html5 iis ims internet java javascript jekyll jenkins jira jquery jruby keras kibana knockout knockout.js kubernetes lasers lemp line scan cameras linux magento mercurial mesos microsoft mongodb mssql mysql netezza nginx node obj c oracle osx outlook perl phone php plc/mlc postgres postgresql power tools powershell puppet python rabbitmq rackspace rails rails.js rds react redis redshift redux rhel ruby s3 sails salesforce sass scikit-learn scss segment.io selenium server 2012 sinatra skype slack sql sql server ssh stack overflow sublime text surreal.cms swift teamcity teradata terminals torquebox ubuntu vagrant vb vba vero vim visual studio visual studio 2012 vmware windows windows batch files windows server wordpress wp xcode xslt
0 0 5 35 20 70000 1 True 4 0 False False 0 True True 8 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 1 10 40 0 120000 1 False 4 3000 False True 3 False False 6 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 0 11 45 15 75000 2 True 0 0 False False 2 True True 6 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
3 0 6 40 16 77000 2 True 0 0 False False 10 True True 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 6 50 14 70000 2 True 0 0 False False 3 False False 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 3 2 50 14 45000 1 False 0 0 False True 4 True False 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0
6 0 16 50 15 93550 3 True 0 350 True True 22 True True 9 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0
7 3 1 40 60 50000 1 False 0 0 True False 24 True False 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
8 1 18 50 30 130000 5 True 0 0 True False 10 False False 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
9 1 30 20 20 120000 2 True 0 5000 True True 10 False False 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [23]:
survey_df_c1.tail(10)


Out[23]:
CLUSTER1_LABEL YEARS_EXP NUM_HOURS_WORK DAYS_OFF SALARY COMP_SIZE HEALTH_INS COMP_MEALS CONF_BUDGET 401K TRAINING_BUDGET BOOKS_READ DEGREE CAREER_DEGREE WORK_HIST_COUNT .net airflow android anger angular ansible apache apex asp .net asp mvc atom aws baby bottles & burping cloths. bash bootstrap brainpower c++ centos chairs classic asp clojure cognos computers consul csharp css css3 db2 for z/os db2 luw debian digitalocean divshot django docker drupal ec2 eclipse ide elasticache elasticsearch elixir ember es6 excel express express.js flask flux force.com platform fsharp giphy git github go google google docs google drive google hangouts gotomeeting graphite gulp hand tools hate heroku hipchat hl7 html html5 iis ims internet java javascript jekyll jenkins jira jquery jruby keras kibana knockout knockout.js kubernetes lasers lemp line scan cameras linux magento mercurial mesos microsoft mongodb mssql mysql netezza nginx node obj c oracle osx outlook perl phone php plc/mlc postgres postgresql power tools powershell puppet python rabbitmq rackspace rails rails.js rds react redis redshift redux rhel ruby s3 sails salesforce sass scikit-learn scss segment.io selenium server 2012 sinatra skype slack sql sql server ssh stack overflow sublime text surreal.cms swift teamcity teradata terminals torquebox ubuntu vagrant vb vba vero vim visual studio visual studio 2012 vmware windows windows batch files windows server wordpress wp xcode xslt
57 1 10.0 45 25 115000 4 True 4 0 True False 12 True True 4 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
58 1 10.0 40 25 120000 2 True 0 0 False False 3 False False 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
59 0 5.0 40 15 80000 2 True 0 0 False False 5 True True 6 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
60 3 2.0 40 20 65000 2 True 2 0 False False 10 False False 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
61 1 20.0 35 45 185000 5 True 0 20000 True True 15 True False 14 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
62 1 11.0 45 20 115000 3 True 4 5000 True True 20 True True 5 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
63 0 12.0 45 15 95000 2 True 2 0 True True 50 True True 6 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
64 0 7.0 30 0 96000 1 False 0 2000 False False 16 False False 4 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
65 3 0.5 40 12 40000 1 True 0 0 False False 4 True True 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
66 3 2.0 40 10 30000 2 False 0 0 False False 3 False False 2 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [24]:
# To prove that out a bit more I group by the cluster label and run some stats
cl1_groups = survey_df_c1.groupby(['CLUSTER1_LABEL'])
cl1_groups.agg([np.mean, np.min, np.max, np.std])


Out[24]:
YEARS_EXP NUM_HOURS_WORK DAYS_OFF SALARY COMP_SIZE HEALTH_INS COMP_MEALS CONF_BUDGET 401K TRAINING_BUDGET BOOKS_READ DEGREE CAREER_DEGREE WORK_HIST_COUNT .net airflow android anger angular ansible apache apex asp .net asp mvc ... ssh stack overflow sublime text surreal.cms swift teamcity teradata terminals torquebox ubuntu vagrant vb vba vero vim visual studio visual studio 2012 vmware windows windows batch files windows server wordpress wp xcode xslt
mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std ... mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std
CLUSTER1_LABEL
0 10.333333 1.0 30 7.030546 43.444444 30 60 6.591962 21.638889 0 100 20.077212 82586 68000 100000 9938.867506 2.333333 1 5 0.925820 0.916667 False True 0.280306 0.861111 0 8 1.709683 423.611111 0 5000 1008.711067 0.638889 False True 0.487136 0.305556 False True 0.467177 9.833333 0 50 12.190160 0.833333 False True 0.377964 0.694444 False True 0.467177 6.027778 1 27 4.462454 0.027778 0 1 0.166667 0.222222 0 1 0.421637 0.027778 0 1 0.166667 0.055556 0 1 0.232311 0.027778 0 1 0.166667 0.250000 0 1 0.439155 0.055556 0 1 0.232311 0.055556 0 1 0.232311 0.027778 0 1 0.166667 0.000000 0 0 0.000000 0.000000 0 0 0.000000 ... 0.027778 0 1 0.166667 0.055556 0 1 0.232311 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.027778 0 1 0.166667 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.027778 0 1 0.166667 0.000000 0 0 0.000000 0.166667 0 1 0.377964 0.055556 0 1 0.232311 0.027778 0 1 0.166667 0.027778 0 1 0.166667 0.027778 0 1 0.166667 0.027778 0 1 0.166667 0.027778 0 1 0.166667 0.000000 0 0 0.000000 0.027778 0 1 0.166667 0.027778 0 1 0.166667 0.027778 0 1 0.166667 0.027778 0 1 0.166667 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.027778 0 1 0.166667 0.000000 0 0 0.000000
1 12.833333 4.0 30 6.939129 39.583333 20 50 7.216878 22.666667 0 45 10.620164 126666 105000 185000 24058.010699 3.083333 1 5 1.443376 0.916667 False True 0.288675 2.083333 0 8 2.778434 3125.000000 0 20000 5657.356434 0.750000 False True 0.452267 0.583333 False True 0.514929 7.583333 1 20 5.869154 0.500000 False True 0.522233 0.416667 False True 0.514929 7.166667 3 20 4.969605 0.083333 0 1 0.288675 0.083333 0 1 0.288675 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.166667 0 1 0.389249 0.000000 0 0 0.000000 0.083333 0 1 0.288675 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 ... 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.083333 0 1 0.288675 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.083333 0 1 0.288675 0.083333 0 1 0.288675 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000
2 9.000000 9.0 9 NaN 40.000000 40 40 NaN 25.000000 25 25 NaN 90000 90000 90000 NaN 2.000000 2 2 NaN 1.000000 True True NaN 1.000000 1 1 NaN 200000.000000 200000 200000 NaN 0.000000 False False NaN 0.000000 False False NaN 8.000000 8 8 NaN 1.000000 True True NaN 1.000000 True True NaN 4.000000 4 4 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN ... 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN
3 5.147059 0.5 28 6.804789 44.705882 40 65 7.174301 18.235294 0 60 13.534813 50426 20000 65000 12621.832742 2.470588 1 5 1.419403 0.764706 False True 0.437237 1.411765 0 10 2.807553 147.058824 0 1500 424.437344 0.588235 False True 0.507300 0.352941 False True 0.492592 7.970588 2 40 9.811458 0.823529 False True 0.392953 0.588235 False True 0.507300 2.882353 1 6 1.615640 0.235294 0 1 0.437237 0.058824 0 1 0.242536 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.058824 0 1 0.242536 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.058824 0 1 0.242536 0.058824 0 1 0.242536 ... 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.058824 0 1 0.242536 0.058824 0 1 0.242536 0.000000 0 0 0.000000 0.058824 0 1 0.242536 0.058824 0 1 0.242536 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.176471 0 1 0.392953 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.058824 0 1 0.242536 0.000000 0 0 0.000000 0.058824 0 1 0.242536 0.000000 0 0 0.000000 0.058824 0 1 0.242536 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.176471 0 1 0.392953 0.058824 0 1 0.242536 0.000000 0 0 0.000000 0.058824 0 1 0.242536
4 1.000000 1.0 1 NaN 45.000000 45 45 NaN 25.000000 25 25 NaN 55000 55000 55000 NaN 2.000000 2 2 NaN 1.000000 True True NaN 9.000000 9 9 NaN 100000.000000 100000 100000 NaN 0.000000 False False NaN 1.000000 True True NaN 3.000000 3 3 NaN 1.000000 True True NaN 0.000000 False False NaN 1.000000 1 1 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN ... 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN 0.000000 0 0 NaN

5 rows × 700 columns


In [25]:
# But what if I wanted to stop salary from outweighing the other features so that the algorithm can consider ALL
#   features in its clustering decision?

def normalize(df, columns):
    result = df.copy()
    for feature_name in columns:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
    return result

In [26]:
# apply it to...
survey_df_normalized = normalize(survey_df_raw, ['YEARS_EXP', 'NUM_HOURS_WORK', 'DAYS_OFF', 'SALARY', 'CONF_BUDGET', 'BOOKS_READ', 'WORK_HIST_COUNT'])
survey_df_normalized.drop(['STACK', 'COMP_MEALS'], axis=1, inplace=True)
survey_df_normalized.head(10)


Out[26]:
YEARS_EXP NUM_HOURS_WORK DAYS_OFF SALARY COMP_SIZE HEALTH_INS CONF_BUDGET 401K TRAINING_BUDGET BOOKS_READ DEGREE CAREER_DEGREE WORK_HIST_COUNT
0 0.152542 0.333333 0.20 0.303030 1 True 0.00000 False False 0.00 True True 0.269231
1 0.322034 0.444444 0.00 0.606061 1 False 0.01500 False True 0.06 False False 0.192308
2 0.355932 0.555556 0.15 0.333333 2 True 0.00000 False False 0.04 True True 0.192308
3 0.186441 0.444444 0.16 0.345455 2 True 0.00000 False False 0.20 True True 0.076923
4 0.186441 0.666667 0.14 0.303030 2 True 0.00000 False False 0.06 False False 0.076923
5 0.050847 0.666667 0.14 0.151515 1 False 0.00000 False True 0.08 True False 0.000000
6 0.525424 0.666667 0.15 0.445758 3 True 0.00175 True True 0.44 True True 0.307692
7 0.016949 0.444444 0.60 0.181818 1 False 0.00000 True False 0.48 True False 0.153846
8 0.593220 0.666667 0.30 0.666667 5 True 0.00000 True False 0.20 False False 0.076923
9 1.000000 0.000000 0.20 0.606061 2 True 0.02500 True True 0.20 False False 0.730769

In [27]:
kmeans.fit(survey_df_normalized)

survey_df_c2 = survey_df_wTM.copy()
survey_df_c2.insert(0, 'CLUSTER2_LABEL', kmeans.labels_)
survey_df_c2.head(10)


Out[27]:
CLUSTER2_LABEL YEARS_EXP NUM_HOURS_WORK DAYS_OFF SALARY COMP_SIZE HEALTH_INS COMP_MEALS CONF_BUDGET 401K TRAINING_BUDGET BOOKS_READ DEGREE CAREER_DEGREE WORK_HIST_COUNT .net airflow android anger angular ansible apache apex asp .net asp mvc atom aws baby bottles & burping cloths. bash bootstrap brainpower c++ centos chairs classic asp clojure cognos computers consul csharp css css3 db2 for z/os db2 luw debian digitalocean divshot django docker drupal ec2 eclipse ide elasticache elasticsearch elixir ember es6 excel express express.js flask flux force.com platform fsharp giphy git github go google google docs google drive google hangouts gotomeeting graphite gulp hand tools hate heroku hipchat hl7 html html5 iis ims internet java javascript jekyll jenkins jira jquery jruby keras kibana knockout knockout.js kubernetes lasers lemp line scan cameras linux magento mercurial mesos microsoft mongodb mssql mysql netezza nginx node obj c oracle osx outlook perl phone php plc/mlc postgres postgresql power tools powershell puppet python rabbitmq rackspace rails rails.js rds react redis redshift redux rhel ruby s3 sails salesforce sass scikit-learn scss segment.io selenium server 2012 sinatra skype slack sql sql server ssh stack overflow sublime text surreal.cms swift teamcity teradata terminals torquebox ubuntu vagrant vb vba vero vim visual studio visual studio 2012 vmware windows windows batch files windows server wordpress wp xcode xslt
0 2 5 35 20 70000 1 True 4 0 False False 0 True True 8 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 3 10 40 0 120000 1 False 4 3000 False True 3 False False 6 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 2 11 45 15 75000 2 True 0 0 False False 2 True True 6 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
3 2 6 40 16 77000 2 True 0 0 False False 10 True True 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 0 6 50 14 70000 2 True 0 0 False False 3 False False 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 3 2 50 14 45000 1 False 0 0 False True 4 True False 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0
6 4 16 50 15 93550 3 True 0 350 True True 22 True True 9 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0
7 3 1 40 60 50000 1 False 0 0 True False 24 True False 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
8 1 18 50 30 130000 5 True 0 0 True False 10 False False 3 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
9 0 30 20 20 120000 2 True 0 5000 True True 10 False False 20 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [28]:
# Fixed! But there is a new problem. I don't know what these clusters mean because there are too many dimensions
#   for me to wrap my tiny little brain around.

cl2_groups = survey_df_c2.groupby(['CLUSTER2_LABEL'])
cl2_groups.agg([np.mean, np.min, np.max, np.std])


Out[28]:
YEARS_EXP NUM_HOURS_WORK DAYS_OFF SALARY COMP_SIZE HEALTH_INS COMP_MEALS CONF_BUDGET 401K TRAINING_BUDGET BOOKS_READ DEGREE CAREER_DEGREE WORK_HIST_COUNT .net airflow android anger angular ansible apache apex asp .net asp mvc ... ssh stack overflow sublime text surreal.cms swift teamcity teradata terminals torquebox ubuntu vagrant vb vba vero vim visual studio visual studio 2012 vmware windows windows batch files windows server wordpress wp xcode xslt
mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std ... mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std mean amin amax std
CLUSTER2_LABEL
0 8.666667 1.0 30 7.715167 41.666667 20 65 9.385906 15.933333 5 25 6.005553 81133 30000 165000 32447.466696 2.000000 2 2 0.000000 0.933333 False True 0.258199 0.800000 0 9 2.336053 7100.000000 0 100000 25733.663323 0.400000 False True 0.507093 0.200000 False True 0.414039 6.600000 1 35 8.466742 0.266667 False True 0.457738 0.000000 False False 0.000000 6.866667 1 27 7.170044 0.000000 0 0 0.000000 0.133333 0 1 0.351866 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.133333 0 1 0.351866 0.066667 0 1 0.258199 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.066667 0 1 0.258199 0.066667 0 1 0.258199 ... 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.066667 0 1 0.258199 0.066667 0 1 0.258199 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.066667 0 1 0.258199 0.066667 0 1 0.258199 0.0 0 0 0.000000 0.0 0 0 0.000000 0.066667 0 1 0.258199 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.0 0 0 0.000000 0.066667 0 1 0.258199 0.0 0 0 0.000000 0.000000 0 0 0.000000 0.066667 0 1 0.258199 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000
1 13.500000 2.0 30 8.759610 43.357143 35 50 5.123207 30.928571 15 100 21.345020 94696 52000 185000 36481.314834 4.571429 4 5 0.513553 1.000000 True True 0.000000 2.571429 0 10 3.274947 1757.142857 0 20000 5303.368175 1.000000 True True 0.000000 0.714286 False True 0.468807 6.714286 0 15 4.664573 0.857143 False True 0.363137 0.714286 False True 0.468807 4.642857 1 14 3.177012 0.142857 0 1 0.363137 0.071429 0 1 0.267261 0.000000 0 0 0.000000 0.071429 0 1 0.267261 0.000000 0 0 0.000000 0.071429 0 1 0.267261 0.000000 0 0 0.000000 0.071429 0 1 0.267261 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 ... 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.071429 0 1 0.267261 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.0 0 0 0.000000 0.0 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.071429 0 1 0.267261 0.000000 0 0 0.000000 0.0 0 0 0.000000 0.000000 0 0 0.000000 0.0 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.071429 0 1 0.267261
2 6.522727 0.5 25 5.491286 42.272727 35 60 5.284135 21.090909 10 100 18.322271 73954 40000 115000 16737.403980 1.909091 1 2 0.294245 0.954545 False True 0.213201 0.727273 0 6 1.517574 9227.272727 0 200000 42612.240378 0.500000 False True 0.511766 0.000000 False False 0.000000 4.659091 0 20 4.658291 1.000000 True True 0.000000 1.000000 True True 0.000000 4.045455 1 9 2.590776 0.090909 0 1 0.294245 0.136364 0 1 0.351250 0.045455 0 1 0.213201 0.000000 0 0 0.000000 0.045455 0 1 0.213201 0.272727 0 1 0.455842 0.045455 0 1 0.213201 0.000000 0 0 0.000000 0.045455 0 1 0.213201 0.000000 0 0 0.000000 0.000000 0 0 0.000000 ... 0.045455 0 1 0.213201 0.045455 0 1 0.213201 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.045455 0 1 0.213201 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.045455 0 1 0.213201 0.045455 0 1 0.213201 0.272727 0 1 0.455842 0.045455 0 1 0.213201 0.0 0 0 0.000000 0.0 0 0 0.000000 0.000000 0 0 0.000000 0.045455 0 1 0.213201 0.000000 0 0 0.000000 0.045455 0 1 0.213201 0.0 0 0 0.000000 0.045455 0 1 0.213201 0.0 0 0 0.000000 0.045455 0 1 0.213201 0.045455 0 1 0.213201 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000
3 6.500000 1.0 18 6.715653 43.333333 30 60 10.327956 14.000000 0 60 23.323808 71833 20000 120000 38937.984873 1.166667 1 2 0.408248 0.000000 False False 0.000000 0.833333 0 4 1.602082 833.333333 0 3000 1329.159885 0.166667 False True 0.408248 0.333333 False True 0.516398 17.833333 3 40 13.775582 0.666667 False True 0.516398 0.000000 False False 0.000000 5.166667 1 10 2.926887 0.000000 0 0 0.000000 0.166667 0 1 0.408248 0.000000 0 0 0.000000 0.166667 0 1 0.408248 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.166667 0 1 0.408248 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 ... 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.166667 0 1 0.408248 0.166667 0 1 0.408248 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.333333 0 1 0.516398 0.000000 0 0 0.000000 0.0 0 0 0.000000 0.0 0 0 0.000000 0.000000 0 0 0.000000 0.166667 0 1 0.408248 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.0 0 0 0.000000 0.000000 0 0 0.000000 0.0 0 0 0.000000 0.000000 0 0 0.000000 0.166667 0 1 0.408248 0.166667 0 1 0.408248 0.166667 0 1 0.408248 0.000000 0 0 0.000000
4 12.200000 5.0 25 5.846176 46.200000 35 55 5.672546 19.100000 12 26 4.677369 89460 40000 115000 20256.518827 2.400000 2 3 0.516398 1.000000 True True 0.000000 2.100000 0 8 2.766867 1615.000000 0 5000 1954.488936 1.000000 True True 0.000000 1.000000 True True 0.000000 18.900000 1 50 16.702960 1.000000 True True 0.000000 0.900000 False True 0.316228 6.900000 3 10 2.424413 0.200000 0 1 0.421637 0.300000 0 1 0.483046 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.300000 0 1 0.483046 0.000000 0 0 0.000000 0.100000 0 1 0.316228 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 ... 0.000000 0 0 0.000000 0.100000 0 1 0.316228 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.100000 0 1 0.316228 0.000000 0 0 0.000000 0.1 0 1 0.316228 0.1 0 1 0.316228 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.1 0 1 0.316228 0.000000 0 0 0.000000 0.1 0 1 0.316228 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000 0.000000 0 0 0.000000

5 rows × 700 columns


In [29]:
# Since I'm looking for clusters of ... say devs that earn similar salaries and have similar numbers of paid holidays and 
#    have similar work histories, why don't I simply cluster on those 3 dimensions?

survey_df_selected = survey_df_normalized[['SALARY', 'DAYS_OFF', 'WORK_HIST_COUNT']]

kmeans = KMeans(n_clusters=5, max_iter=500)
kmeans.fit(survey_df_selected)
kmeans.labels_


Out[29]:
array([4, 4, 4, 1, 1, 1, 4, 3, 0, 2, 4, 1, 0, 1, 2, 4, 4, 1, 4, 1, 1, 4, 4,
       1, 1, 3, 1, 4, 1, 1, 1, 4, 4, 1, 4, 4, 1, 1, 4, 1, 4, 3, 0, 1, 1, 4,
       1, 1, 4, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 4, 1, 0, 0, 4, 4, 1, 1], dtype=int32)

In [30]:
survey_df_c3 = survey_df_raw.copy()[['SALARY', 'DAYS_OFF', 'WORK_HIST_COUNT']]
survey_df_c3.insert(0, 'CLUSTER3_LABEL', kmeans.labels_)
survey_df_c3.head(10)


Out[30]:
CLUSTER3_LABEL SALARY DAYS_OFF WORK_HIST_COUNT
0 4 70000 20 8
1 4 120000 0 6
2 4 75000 15 6
3 1 77000 16 3
4 1 70000 14 3
5 1 45000 14 1
6 4 93550 15 9
7 3 50000 60 5
8 0 130000 30 3
9 2 120000 20 20

In [31]:
cl3_groups = survey_df_c3.groupby(['CLUSTER3_LABEL'])
cl3_groups.agg([np.mean, np.min, np.max, np.std])


Out[31]:
SALARY DAYS_OFF WORK_HIST_COUNT
mean amin amax std mean amin amax std mean amin amax std
CLUSTER3_LABEL
0 133750 115000 185000 26558.560616 26.125000 15 45 9.109453 6.250000 3 14 3.535534
1 61976 20000 89000 15604.016150 16.781250 0 33 7.156023 2.843750 1 6 1.393750
2 105000 90000 120000 21213.203436 17.000000 14 20 4.242641 23.500000 20 27 4.949747
3 78500 50000 95000 24784.067463 86.666667 60 100 23.094011 5.666667 3 9 3.055050
4 90754 70000 120000 11685.011853 16.863636 0 26 7.278772 6.909091 4 10 1.949581

In [34]:
%matplotlib inline

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

survey_df_c3b = survey_df_normalized.copy()[['SALARY', 'DAYS_OFF', 'WORK_HIST_COUNT']]
survey_df_c3b.insert(0, 'CLUSTER3_LABEL', kmeans.labels_)
cl3b_groups = survey_df_c3b.groupby(['CLUSTER3_LABEL'])

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

ax.scatter(cl3b_groups.get_group(0)['SALARY'].tolist(), 
           cl3b_groups.get_group(0)['DAYS_OFF'].tolist(), 
           cl3b_groups.get_group(0)['WORK_HIST_COUNT'].tolist(), 
           c='r', marker='o')
ax.scatter(cl3b_groups.get_group(1)['SALARY'].tolist(), 
           cl3b_groups.get_group(1)['DAYS_OFF'].tolist(), 
           cl3b_groups.get_group(1)['WORK_HIST_COUNT'].tolist(), 
           c='b', marker='o')
ax.scatter(cl3b_groups.get_group(2)['SALARY'].tolist(), 
           cl3b_groups.get_group(2)['DAYS_OFF'].tolist(), 
           cl3b_groups.get_group(2)['WORK_HIST_COUNT'].tolist(), 
           c='y', marker='^')
ax.scatter(cl3b_groups.get_group(3)['SALARY'].tolist(), 
           cl3b_groups.get_group(3)['DAYS_OFF'].tolist(), 
           cl3b_groups.get_group(3)['WORK_HIST_COUNT'].tolist(), 
           c='g', marker='x')
ax.scatter(cl3b_groups.get_group(4)['SALARY'].tolist(), 
           cl3b_groups.get_group(4)['DAYS_OFF'].tolist(), 
           cl3b_groups.get_group(4)['WORK_HIST_COUNT'].tolist(), 
           c='w', marker='o')

ax.set_xlabel('SALARY')
ax.set_ylabel('DAYS_OFF')
ax.set_zlabel('WORK_HIST_COUNT')

plt.show()


/Users/timojo/anaconda/lib/python2.7/site-packages/matplotlib/collections.py:590: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison
  if self._edgecolors == str('face'):

Question 4: Of the features we have, what are the important features in this dataset when it comes to determining salary? How do I predict salary using a decision tree?


In [35]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz

y = survey_df_raw["SALARY"]
X = survey_df_raw.drop(["SALARY", "STACK"], axis=1, inplace=False)
tree = DecisionTreeClassifier(min_samples_split=4)
tree.fit(X, y)


Out[35]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=4, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [36]:
from os import system
from IPython.display import Image

with open('tree.dot', 'w') as dotfile:
    export_graphviz(tree, dotfile, feature_names=X.columns)

system("dot -Tpng tree.dot -o tree.png")
Image(filename='tree.png')


Out[36]:

In [ ]: