Set up


In [1]:
# Load needed modules and functions
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np

from pylab import figure, show

import pandas as pd
from pandas import DataFrame, Series
from sklearn.neighbors import NearestNeighbors

In [2]:
#set up path to the data files
import os
data_folder = os.path.join(os.pardir, "data")

In [3]:
import glob 
file_names = glob.glob(data_folder + "/*")
#file_names = glob.glob(data_folder + "\\\*")

In [4]:
import re
p = re.compile('data/(.*).txt')
#p = re.compile('data\\\(.*).txt')
name_list = []
for name in file_names:
    frame_name = p.findall(name)[0]
    frame_name = frame_name.lower().replace(" ","_")
    frame_name = frame_name.replace(",","")
    name_list.append(frame_name)
    frame = pd.read_table(name, sep= '\t')
    #reformat column names
    columns = frame.columns
    columns = [x.lower().replace("*","").replace("-","_").replace(" ","_") for x in columns]
    frame.columns = columns
    #create a variable named the frame_name
    vars()[frame_name] = frame


/Users/agswigart/anaconda/envs/myenv/lib/python2.7/site-packages/pandas/io/parsers.py:1070: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
  data = self._reader.read(nrows)

Functions


In [5]:
#function that calculates the number of features available in a dataframe (the # rows divided by # of jobs)
def feature(dataframe):
    return len(dataframe)/len(dataframe.onet_soc_code.unique())

Data Filtering

Abilities


In [6]:
#In abilities, we only want to keep the rows where scale_id == 'IM'
abilities_final = abilities[abilities.scale_id == 'IM']

In [7]:
len(abilities_final)


Out[7]:
47996

In [8]:
feature(abilities_final)


Out[8]:
52

Knowledge


In [9]:
#In knowledge, we only want to keep the rows where scale_id == 'IM'
knowledge_final = knowledge[knowledge.scale_id == 'IM']

In [10]:
len(knowledge_final)


Out[10]:
30459

In [11]:
feature(knowledge_final)


Out[11]:
33

Interests


In [12]:
#In interests, we only want to keep rows where scale_id == 'OI'
interests_final = interests[interests.scale_id == 'OI']

In [13]:
len(interests_final)


Out[13]:
5844

In [14]:
feature(interests_final)


Out[14]:
6

In [15]:
interests_final['domain'] = 'Interests'
interests_final.head()


Out[15]:
onet_soc_code element_id element_name scale_id data_value date domain_source domain
0 11-1011.00 1.B.1.a Realistic OI 1.33 06/2008 Analyst Interests
1 11-1011.00 1.B.1.b Investigative OI 2.00 06/2008 Analyst Interests
2 11-1011.00 1.B.1.c Artistic OI 2.67 06/2008 Analyst Interests
3 11-1011.00 1.B.1.d Social OI 3.67 06/2008 Analyst Interests
4 11-1011.00 1.B.1.e Enterprising OI 7.00 06/2008 Analyst Interests

5 rows × 8 columns


In [16]:
interests_pt = interests_final.pivot_table('data_value',
                                           rows = 'onet_soc_code',
                                           cols = ['domain', 'element_name'],
                                           aggfunc = 'sum')
interests_pt.head()


Out[16]:
domain Interests
element_name Artistic Conventional Enterprising Investigative Realistic Social
onet_soc_code
11-1011.00 2.67 5.33 7 2.00 1.33 3.67
11-1011.03 2.67 4.33 7 4.33 1.00 2.33
11-1021.00 1.00 3.67 7 1.33 1.33 3.33
11-1031.00 3.67 3.00 7 3.67 1.00 4.67
11-2011.00 5.33 4.67 7 2.00 1.67 2.33

5 rows × 6 columns

Job Zones


In [17]:
#we do not need to do anything to job_zones
job_zones_final = job_zones

In [18]:
len(job_zones_final)


Out[18]:
924

In [19]:
feature(job_zones_final)


Out[19]:
1

In [20]:
job_zones_final.head()


Out[20]:
onet_soc_code job_zone date domain_source
0 11-1011.00 5 06/2006 Analyst
1 11-1011.03 5 07/2013 Analyst
2 11-1021.00 3 06/2008 Analyst
3 11-1031.00 4 06/2008 Analyst
4 11-2011.00 4 06/2010 Analyst

5 rows × 4 columns


In [21]:
job_zones_final['domain'] = 'Job_Zones'
job_zones_final['element_name'] = 'job_zone'
job_zones_pt = job_zones_final.pivot_table('job_zone',
                                           rows = 'onet_soc_code',
                                           cols = ['domain', 'element_name'],
                                           aggfunc = 'sum')
job_zones_pt.head()


Out[21]:
domain Job_Zones
element_name job_zone
onet_soc_code
11-1011.00 5
11-1011.03 5
11-1021.00 3
11-1031.00 4
11-2011.00 4

5 rows × 1 columns

Skills


In [22]:
#for skills, we only want to keep rows where scale_id == "IM"
skills_final = skills[skills.scale_id == 'IM']

In [23]:
len(skills_final)


Out[23]:
32305

In [24]:
feature(skills_final)


Out[24]:
35

In [25]:
skills_final.head()


Out[25]:
onet_soc_code element_id element_name scale_id data_value n standard_error lower_ci_bound upper_ci_bound recommend_suppress not_relevant date domain_source
0 11-1011.00 2.A.1.a Reading Comprehension IM 4.38 8 0.18 4.02 4.73 N n/a 06/2010 Analyst
2 11-1011.00 2.A.1.b Active Listening IM 4.38 8 0.18 4.02 4.73 N n/a 06/2010 Analyst
4 11-1011.00 2.A.1.c Writing IM 4.12 8 0.23 3.68 4.57 N n/a 06/2010 Analyst
6 11-1011.00 2.A.1.d Speaking IM 4.38 8 0.18 4.02 4.73 N n/a 06/2010 Analyst
8 11-1011.00 2.A.1.e Mathematics IM 3.00 8 0.19 2.63 3.37 N n/a 06/2010 Analyst

5 rows × 13 columns


In [26]:
skills_final['domain'] = 'Skills'
skills_pt = skills_final.pivot_table('data_value',
                                     rows = 'onet_soc_code',
                                     cols = ['domain', 'element_name'],
                                     aggfunc = 'sum')
skills_pt.head()


Out[26]:
domain Skills
element_name Active Learning Active Listening Complex Problem Solving Coordination Critical Thinking Equipment Maintenance Equipment Selection Installation Instructing Judgment and Decision Making Learning Strategies Management of Financial Resources Management of Material Resources Management of Personnel Resources Mathematics Monitoring Negotiation Operation Monitoring Operation and Control Operations Analysis
onet_soc_code
11-1011.00 4.00 4.38 4.50 4.25 4.38 1 1.00 1.00 3.25 4.50 3.38 4.12 3.62 4.25 3.00 4.12 4.00 2.25 1.88 3.50 ...
11-1011.03 3.50 3.88 4.00 3.62 4.00 1 1.12 1.00 3.25 3.75 3.38 2.62 2.38 3.38 2.75 3.62 2.88 2.25 1.62 2.62 ...
11-1021.00 3.50 4.00 3.50 3.62 3.88 1 1.25 1.12 3.12 3.50 2.75 2.88 3.12 3.38 2.25 3.75 3.50 2.88 2.12 3.25 ...
11-2011.00 3.25 4.00 3.50 3.50 3.75 1 1.25 1.00 2.88 3.75 2.75 2.88 2.50 3.12 3.00 3.25 3.38 1.62 1.25 2.88 ...
11-2021.00 3.50 3.88 3.38 3.50 3.88 1 1.00 1.00 3.12 3.62 3.00 2.88 2.38 3.38 2.75 3.62 3.25 2.00 1.00 3.50 ...

5 rows × 35 columns

Work Activities


In [27]:
#for work activities, we only want to keep rows where scale_id == 'IM'
work_activities_final = work_activities[work_activities.scale_id == 'IM']

In [28]:
len(work_activities_final)


Out[28]:
37843

In [29]:
feature(work_activities_final)


Out[29]:
41

In [30]:
work_activities_final['domain'] = 'Work_Activities'
work_activities_pt = work_activities_final.pivot_table('data_value',
                                     rows = 'onet_soc_code',
                                     cols = ['domain', 'element_name'],
                                     aggfunc = 'sum')
work_activities_pt.head()


Out[30]:
domain Work_Activities
element_name Analyzing Data or Information Assisting and Caring for Others Coaching and Developing Others Communicating with Persons Outside Organization Communicating with Supervisors, Peers, or Subordinates Controlling Machines and Processes Coordinating the Work and Activities of Others Developing Objectives and Strategies Developing and Building Teams Documenting/Recording Information Drafting, Laying Out, and Specifying Technical Devices, Parts, and Equipment Establishing and Maintaining Interpersonal Relationships Estimating the Quantifiable Characteristics of Products, Events, or Information Evaluating Information to Determine Compliance with Standards Getting Information Guiding, Directing, and Motivating Subordinates Handling and Moving Objects Identifying Objects, Actions, and Events Inspecting Equipment, Structures, or Material Interacting With Computers
onet_soc_code
11-1011.00 4.19 2.22 3.91 4.62 4.75 1.32 4.00 4.63 4.55 2.19 1.22 4.64 2.68 3.50 4.75 4.02 1.48 3.64 1.49 3.28 ...
11-1011.03 3.85 2.23 3.64 4.46 4.58 1.36 3.96 4.31 4.12 3.44 2.56 4.28 3.31 4.12 4.48 3.88 1.62 3.62 2.36 4.32 ...
11-1021.00 3.49 3.08 3.41 3.83 3.74 1.99 4.09 3.22 3.56 3.29 2.70 3.82 3.36 3.54 4.37 3.48 2.53 3.65 3.36 3.68 ...
11-2011.00 2.81 2.10 2.68 4.56 4.28 2.22 3.06 3.68 3.27 3.30 1.70 4.04 3.07 2.50 4.38 2.72 1.86 3.36 2.13 4.08 ...
11-2021.00 3.52 2.40 3.54 4.60 4.58 1.32 3.96 4.04 4.24 2.84 1.92 4.40 3.16 2.84 4.52 3.60 1.52 3.84 1.84 4.08 ...

5 rows × 41 columns

Work Context


In [31]:
#in work context, we only want to keep rows where scale_id == 'CX' or 'CT'
work_context_final = work_context[(work_context['scale_id'] == 'CX') | (work_context['scale_id'] == 'CT')]

In [32]:
len(work_context_final)


Out[32]:
52592

In [33]:
feature(work_context_final)


Out[33]:
56

In [34]:
work_context_final_CX = work_context_final[work_context_final['scale_id'] == 'CX']
work_context_final_CT = work_context_final[work_context_final['scale_id'] == 'CT']

In [35]:
work_context_final_CX['domain'] = 'Work_Context'
work_context_CX_pt = work_context_final_CX.pivot_table('data_value',
                                     rows = 'onet_soc_code',
                                     cols = ['domain', 'element_name'],
                                     aggfunc = 'sum')
work_context_CX_pt.head()


Out[35]:
domain Work_Context
element_name Consequence of Error Contact With Others Coordinate or Lead Others Cramped Work Space, Awkward Positions Deal With External Customers Deal With Physically Aggressive People Deal With Unpleasant or Angry People Degree of Automation Electronic Mail Exposed to Contaminants Exposed to Disease or Infections Exposed to Hazardous Conditions Exposed to Hazardous Equipment Exposed to High Places Exposed to Minor Burns, Cuts, Bites, or Stings Exposed to Radiation Exposed to Whole Body Vibration Extremely Bright or Inadequate Lighting Face-to-Face Discussions Freedom to Make Decisions
onet_soc_code
11-1011.00 3.55 4.84 4.32 1.47 3.83 2.07 3.92 1.80 5.00 1.49 1.01 1.47 1.49 1.32 1.34 1.00 1.44 1.35 5.00 4.92 ...
11-1011.03 2.35 4.38 4.12 1.38 3.73 1.04 2.38 1.72 4.96 1.65 1.12 1.23 1.17 1.50 1.15 1.08 1.08 1.42 4.76 4.42 ...
11-1021.00 3.04 4.76 4.20 1.32 4.48 1.60 3.39 2.32 4.26 2.11 1.68 1.40 1.68 1.76 2.19 1.01 1.15 1.79 4.60 4.80 ...
11-2011.00 2.06 4.65 4.12 1.53 3.89 1.29 2.73 2.56 5.00 1.12 1.17 1.03 1.11 1.12 1.31 1.00 1.03 1.39 4.56 4.15 ...
11-2021.00 2.40 4.64 3.72 1.21 4.00 1.12 2.56 2.08 5.00 1.16 1.00 1.04 1.12 1.17 1.04 1.00 1.00 1.20 4.84 4.20 ...

5 rows × 55 columns


In [36]:
work_context_final_CT['domain'] = 'Work_Context_Time'
work_context_CT_pt = work_context_final_CT.pivot_table('data_value',
                                     rows = 'onet_soc_code',
                                     cols = ['domain', 'element_name'],
                                     aggfunc = 'sum')
work_context_CT_pt.head()


Out[36]:
domain Work_Context_Time
element_name Duration of Typical Work Week Work Schedules
onet_soc_code
11-1011.00 2.91 1.00
11-1011.03 2.77 1.35
11-1021.00 2.67 1.37
11-2011.00 2.51 1.04
11-2021.00 2.68 1.28

5 rows × 2 columns

Work Styles


In [37]:
#in work styles, we can keep everythin
work_styles_final = work_styles

In [38]:
len(work_styles_final)


Out[38]:
14752

In [39]:
feature(work_styles_final)


Out[39]:
16

In [40]:
work_styles_final['domain'] = 'Work_Styles'
work_styles_pt = work_styles_final.pivot_table('data_value',
                                     rows = 'onet_soc_code',
                                     cols = ['domain', 'element_name'],
                                     aggfunc = 'sum')
work_styles_pt.head()


Out[40]:
domain Work_Styles
element_name Achievement/Effort Adaptability/Flexibility Analytical Thinking Attention to Detail Concern for Others Cooperation Dependability Independence Initiative Innovation Integrity Leadership Persistence Self Control Social Orientation Stress Tolerance
onet_soc_code
11-1011.00 4.66 4.48 4.24 4.26 3.95 4.42 4.67 4.63 4.79 4.22 4.85 4.84 4.61 4.28 4.02 4.75
11-1011.03 4.19 4.23 4.31 4.12 3.48 4.32 4.23 4.27 4.60 4.38 4.58 4.64 4.31 4.00 3.35 4.08
11-1021.00 4.07 4.21 4.22 4.52 3.96 4.26 4.73 3.96 4.36 3.88 4.36 4.50 4.24 4.38 3.56 4.35
11-2011.00 4.30 4.54 4.16 4.70 3.93 4.40 4.74 4.08 4.71 4.51 4.66 4.23 4.23 4.42 3.99 4.39
11-2021.00 4.24 4.24 3.84 4.48 3.72 4.44 4.56 4.20 4.32 4.08 4.40 4.36 4.28 4.04 3.88 4.20

5 rows × 16 columns

Work Values


In [41]:
#in work values, we want to only keep rows where scale_id == 'EX'
work_values_final = work_values[work_values.scale_id == 'EX']

In [42]:
len(work_values_final)


Out[42]:
5844

In [43]:
feature(work_values_final)


Out[43]:
6

In [44]:
work_values_final['domain'] = 'Work_Values'
work_values_pt = work_values_final.pivot_table('data_value',
                                     rows = 'onet_soc_code',
                                     cols = ['domain', 'element_name'],
                                     aggfunc = 'sum')
work_values_pt.head()


Out[44]:
domain Work_Values
element_name Achievement Independence Recognition Relationships Support Working Conditions
onet_soc_code
11-1011.00 6.33 7.00 7.00 5.00 5.33 6.33
11-1011.03 6.67 6.67 6.00 5.00 3.33 6.33
11-1021.00 5.33 6.00 5.67 6.33 4.67 6.00
11-1031.00 5.33 5.00 5.00 5.67 4.00 4.33
11-2011.00 5.33 5.33 5.33 5.00 4.00 5.33

5 rows × 6 columns


In [45]:
occupation_data['element_name'] = "title"
occupation_data['domain'] = 'Occupation'
occ_data_pt = occupation_data.pivot_table('title',
                                     rows = 'onet_soc_code',
                                     cols = ['domain', 'element_name'],
                                     aggfunc = 'sum')


#combined_df = combined_df.rename(columns=lambda x: x.replace(' ', '_'))
occ_data_pt.Occupation.title = occ_data_pt.Occupation.title.apply(lambda x: x.replace(' ', '_'))
occ_data_pt.Occupation.title = occ_data_pt.Occupation.title.apply(lambda x: x.replace('/', '_'))
occ_data_pt.Occupation.title = occ_data_pt.Occupation.title.apply(lambda x: x.replace(',', '_'))


occ_data_pt.tail()
len(set(occ_data_pt.index))


/Users/agswigart/anaconda/envs/myenv/lib/python2.7/site-packages/pandas/core/generic.py:1830: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_index,col_indexer] = value instead
  self[name] = value
Out[45]:
1110

In [46]:
domain_pt_list = [interests_pt, job_zones_pt, skills_pt, work_activities_pt, work_context_CX_pt, work_context_CT_pt, work_styles_pt, work_values_pt]

combined_df = pd.concat(domain_pt_list, axis=1)

combined_df = pd.merge(occ_data_pt, combined_df, left_index = True, right_index = True)

combined_df.head()


Out[46]:
domain Occupation Interests Job_Zones Skills
element_name title Artistic Conventional Enterprising Investigative Realistic Social job_zone Active Learning Active Listening Complex Problem Solving Coordination Critical Thinking Equipment Maintenance Equipment Selection Installation Instructing Judgment and Decision Making Learning Strategies Management of Financial Resources
11-1011.00 Chief_Executives 2.67 5.33 7 2.00 1.33 3.67 5 4.00 4.38 4.5 4.25 4.38 1 1.00 1.00 3.25 4.50 3.38 4.12 ...
11-1011.03 Chief_Sustainability_Officers 2.67 4.33 7 4.33 1.00 2.33 5 3.50 3.88 4.0 3.62 4.00 1 1.12 1.00 3.25 3.75 3.38 2.62 ...
11-1021.00 General_and_Operations_Managers 1.00 3.67 7 1.33 1.33 3.33 3 3.50 4.00 3.5 3.62 3.88 1 1.25 1.12 3.12 3.50 2.75 2.88 ...
11-1031.00 Legislators 3.67 3.00 7 3.67 1.00 4.67 4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ...
11-2011.00 Advertising_and_Promotions_Managers 5.33 4.67 7 2.00 1.67 2.33 4 3.25 4.00 3.5 3.50 3.75 1 1.25 1.00 2.88 3.75 2.75 2.88 ...

5 rows × 163 columns


In [47]:
# Remove spaces in element names
combined_df = combined_df.rename(columns=lambda x: x.replace(' ', '_'))

# combined_df.fillna(0, inplace=True)

In [48]:
combined_df.head()


Out[48]:
domain Occupation Interests Job_Zones Skills
element_name title Artistic Conventional Enterprising Investigative Realistic Social job_zone Active_Learning Active_Listening Complex_Problem_Solving Coordination Critical_Thinking Equipment_Maintenance Equipment_Selection Installation Instructing Judgment_and_Decision_Making Learning_Strategies Management_of_Financial_Resources
11-1011.00 Chief_Executives 2.67 5.33 7 2.00 1.33 3.67 5 4.00 4.38 4.5 4.25 4.38 1 1.00 1.00 3.25 4.50 3.38 4.12 ...
11-1011.03 Chief_Sustainability_Officers 2.67 4.33 7 4.33 1.00 2.33 5 3.50 3.88 4.0 3.62 4.00 1 1.12 1.00 3.25 3.75 3.38 2.62 ...
11-1021.00 General_and_Operations_Managers 1.00 3.67 7 1.33 1.33 3.33 3 3.50 4.00 3.5 3.62 3.88 1 1.25 1.12 3.12 3.50 2.75 2.88 ...
11-1031.00 Legislators 3.67 3.00 7 3.67 1.00 4.67 4 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ...
11-2011.00 Advertising_and_Promotions_Managers 5.33 4.67 7 2.00 1.67 2.33 4 3.25 4.00 3.5 3.50 3.75 1 1.25 1.00 2.88 3.75 2.75 2.88 ...

5 rows × 163 columns


In [49]:
def normalize(series):
    maximum = series.max()
    minimum = series.min()
    return [(item - minimum) / (maximum - minimum) for item in series]

In [50]:
normed_df = combined_df.copy()
normed_df.iloc[:,1:] = normed_df.iloc[:,1:].apply(normalize)
normed_df.head()


Out[50]:
domain Occupation Interests Job_Zones Skills
element_name title Artistic Conventional Enterprising Investigative Realistic Social job_zone Active_Learning Active_Listening Complex_Problem_Solving Coordination Critical_Thinking Equipment_Maintenance Equipment_Selection Installation Instructing Judgment_and_Decision_Making Learning_Strategies Management_of_Financial_Resources
11-1011.00 Chief_Executives 0.278333 0.721667 1 0.166667 0.055000 0.445000 1.00 0.949367 0.800 1.0 1.000000 0.852 0 0.000000 0.000000 0.621547 1.0 0.693252 1.000000 ...
11-1011.03 Chief_Sustainability_Officers 0.278333 0.555000 1 0.555000 0.000000 0.221667 1.00 0.738397 0.600 0.8 0.720000 0.700 0 0.040000 0.000000 0.621547 0.7 0.693252 0.519231 ...
11-1021.00 General_and_Operations_Managers 0.000000 0.445000 1 0.055000 0.055000 0.388333 0.50 0.738397 0.648 0.6 0.720000 0.652 0 0.083333 0.038462 0.585635 0.6 0.500000 0.602564 ...
11-1031.00 Legislators 0.445000 0.333333 1 0.445000 0.000000 0.611667 0.75 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ...
11-2011.00 Advertising_and_Promotions_Managers 0.721667 0.611667 1 0.166667 0.111667 0.221667 0.75 0.632911 0.648 0.6 0.666667 0.600 0 0.083333 0.000000 0.519337 0.7 0.500000 0.602564 ...

5 rows × 163 columns

Visualizing the Features


In [51]:
from math import floor,ceil
def draw_histogram(domain_frame):
    fig, axes = plt.subplots(nrows=int((ceil(float(len(domain_frame.columns))/3.0))), ncols=3, figsize = (12,len(domain_frame.columns)))
    plt.subplots_adjust(hspace = 0.4)
    for i,column_name in enumerate(domain_frame.columns):
        row = int(floor(i/3))
        column = i % 3
        domain_frame[column_name].hist(bins=10, ax=axes[row,column]); axes[row,column].set_title(column_name); axes[row,column].set_ylim([0,500])

In [52]:
# draw_histogram(combined_df.Job_Zones)
# normed_df.Interests.Investigative.hist(bins=10)
draw_histogram(normed_df.Interests)



In [53]:
draw_histogram(normed_df.Skills)



In [54]:
draw_histogram(normed_df.Work_Activities)



In [55]:
draw_histogram(normed_df.Work_Context)



In [56]:
draw_histogram(normed_df.Work_Context_Time)


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-56-ab67bc215f7f> in <module>()
----> 1 draw_histogram(normed_df.Work_Context_Time)

<ipython-input-51-e444ff717fe3> in draw_histogram(domain_frame)
      6         row = int(floor(i/3))
      7         column = i % 3
----> 8         domain_frame[column_name].hist(bins=10, ax=axes[row,column]); axes[row,column].set_title(column_name); axes[row,column].set_ylim([0,500])

IndexError: too many indices

In [57]:
draw_histogram(normed_df.Work_Styles)



In [58]:
draw_histogram(normed_df.Work_Values)


Correlation between Features


In [59]:
corr_df = normed_df.iloc[:,1:].corr()

In [60]:
corr_df.index = corr_df.index.droplevel(0)
corr_df.head()


Out[60]:
domain Interests Job_Zones Skills
element_name Artistic Conventional Enterprising Investigative Realistic Social job_zone Active_Learning Active_Listening Complex_Problem_Solving Coordination Critical_Thinking Equipment_Maintenance Equipment_Selection Installation Instructing Judgment_and_Decision_Making Learning_Strategies Management_of_Financial_Resources Management_of_Material_Resources
element_name
Artistic 1.000000 -0.401212 0.012268 0.201901 -0.388635 0.311364 0.427659 0.360623 0.292641 0.245882 0.188951 0.240592 -0.339151 -0.250277 -0.179429 0.314698 0.259137 0.340943 0.084552 0.085508 ...
Conventional -0.401212 1.000000 0.267182 -0.149493 -0.137886 -0.239040 -0.177788 -0.163082 -0.000384 -0.126095 -0.092104 -0.112272 -0.098427 -0.161965 -0.064627 -0.253891 -0.144256 -0.230510 0.052377 -0.044017 ...
Enterprising 0.012268 0.267182 1.000000 -0.311084 -0.550015 0.191897 0.139388 0.237040 0.446623 0.199139 0.506449 0.266822 -0.420746 -0.455691 -0.310320 0.184611 0.304429 0.169215 0.412286 0.261025 ...
Investigative 0.201901 -0.149493 -0.311084 1.000000 -0.050547 0.061720 0.643559 0.572187 0.360374 0.617767 0.114631 0.588151 -0.064329 0.040445 0.045447 0.382763 0.495419 0.439133 0.151122 0.167033 ...
Realistic -0.388635 -0.137886 -0.550015 -0.050547 1.000000 -0.569939 -0.536989 -0.500476 -0.664910 -0.345526 -0.425227 -0.435737 0.663616 0.683996 0.421077 -0.419841 -0.409970 -0.453541 -0.203513 -0.025298 ...

5 rows × 162 columns


In [61]:
corr_df.columns = corr_df.columns.droplevel(0)
corr_df.head()


Out[61]:
element_name Artistic Conventional Enterprising Investigative Realistic Social job_zone Active_Learning Active_Listening Complex_Problem_Solving Coordination Critical_Thinking Equipment_Maintenance Equipment_Selection Installation Instructing Judgment_and_Decision_Making Learning_Strategies Management_of_Financial_Resources Management_of_Material_Resources
element_name
Artistic 1.000000 -0.401212 0.012268 0.201901 -0.388635 0.311364 0.427659 0.360623 0.292641 0.245882 0.188951 0.240592 -0.339151 -0.250277 -0.179429 0.314698 0.259137 0.340943 0.084552 0.085508 ...
Conventional -0.401212 1.000000 0.267182 -0.149493 -0.137886 -0.239040 -0.177788 -0.163082 -0.000384 -0.126095 -0.092104 -0.112272 -0.098427 -0.161965 -0.064627 -0.253891 -0.144256 -0.230510 0.052377 -0.044017 ...
Enterprising 0.012268 0.267182 1.000000 -0.311084 -0.550015 0.191897 0.139388 0.237040 0.446623 0.199139 0.506449 0.266822 -0.420746 -0.455691 -0.310320 0.184611 0.304429 0.169215 0.412286 0.261025 ...
Investigative 0.201901 -0.149493 -0.311084 1.000000 -0.050547 0.061720 0.643559 0.572187 0.360374 0.617767 0.114631 0.588151 -0.064329 0.040445 0.045447 0.382763 0.495419 0.439133 0.151122 0.167033 ...
Realistic -0.388635 -0.137886 -0.550015 -0.050547 1.000000 -0.569939 -0.536989 -0.500476 -0.664910 -0.345526 -0.425227 -0.435737 0.663616 0.683996 0.421077 -0.419841 -0.409970 -0.453541 -0.203513 -0.025298 ...

5 rows × 162 columns


In [62]:
corr_pairs_list = []
for i in range(len(corr_df.index)):
    row_name = corr_df.index[i]
    for j in range(i + 1, len(corr_df.columns)):
        column_name = corr_df.columns[j]
        corr_pairs_list.append([row_name,column_name, corr_df.ix[i,j]])

In [63]:
corr_pairs_df = DataFrame(corr_pairs_list)

In [64]:
corr_pairs_df.sort(2)


Out[64]:
0 1 2
12446 Spend_Time_Sitting Spend_Time_Standing -0.967267
12448 Spend_Time_Sitting Spend_Time_Walking_and_Running -0.835168
12265 Spend_Time_Bending_or_Twisting_the_Body Spend_Time_Sitting -0.795219
8911 Performing_General_Physical_Activities Spend_Time_Sitting -0.782129
10586 Electronic_Mail Spend_Time_Bending_or_Twisting_the_Body -0.773786
7753 Handling_and_Moving_Objects Spend_Time_Sitting -0.751062
5861 Writing Spend_Time_Bending_or_Twisting_the_Body -0.717948
8051 Interacting_With_Computers Spend_Time_Bending_or_Twisting_the_Body -0.717089
4486 Reading_Comprehension Spend_Time_Bending_or_Twisting_the_Body -0.713159
7717 Handling_and_Moving_Objects Electronic_Mail -0.711717
10990 Exposed_to_Minor_Burns,_Cuts,_Bites,_or_Stings Spend_Time_Sitting -0.711399
10593 Electronic_Mail Spend_Time_Using_Your_Hands_to_Handle,_Control... -0.705413
12376 Spend_Time_Kneeling,_Crouching,_Stooping,_or_C... Spend_Time_Sitting -0.703470
10592 Electronic_Mail Spend_Time_Standing -0.699675
668 Realistic Speaking -0.697539
5868 Writing Spend_Time_Using_Your_Hands_to_Handle,_Control... -0.695551
11738 Indoors,_Environmentally_Controlled Very_Hot_or_Cold_Temperatures -0.688135
5133 Speaking Spend_Time_Using_Your_Hands_to_Handle,_Control... -0.687152
5797 Writing Handling_and_Moving_Objects -0.683033
11098 Exposed_to_Whole_Body_Vibration Indoors,_Environmentally_Controlled -0.680449
4422 Reading_Comprehension Handling_and_Moving_Objects -0.670271
10561 Electronic_Mail Exposed_to_Minor_Burns,_Cuts,_Bites,_or_Stings -0.667508
11663 In_an_Open_Vehicle_or_Equipment Indoors,_Environmentally_Controlled -0.667292
4493 Reading_Comprehension Spend_Time_Using_Your_Hands_to_Handle,_Control... -0.665916
641 Realistic Active_Listening -0.664910
8618 Operating_Vehicles,_Mechanized_Devices,_or_Equ... Indoors,_Environmentally_Controlled -0.660575
7687 Handling_and_Moving_Objects Interacting_With_Computers -0.660367
8057 Interacting_With_Computers Spend_Time_Standing -0.659657
12340 Spend_Time_Keeping_or_Regaining_Balance Spend_Time_Sitting -0.655549
1380 Active_Listening Spend_Time_Using_Your_Hands_to_Handle,_Control... -0.652968
8875 Performing_General_Physical_Activities Electronic_Mail -0.650777
7999 Interacting_With_Computers Performing_General_Physical_Activities -0.650413
6325 Communicating_with_Persons_Outside_Organization Pace_Determined_by_Speed_of_Equipment -0.650114
1066 job_zone Spend_Time_Bending_or_Twisting_the_Body -0.644585
10660 Exposed_to_Contaminants Spend_Time_Sitting -0.644513
11832 Letters_and_Memos Spend_Time_Using_Your_Hands_to_Handle,_Control... -0.643299
10844 Exposed_to_Hazardous_Equipment Indoors,_Environmentally_Controlled -0.641733
10594 Electronic_Mail Spend_Time_Walking_and_Running -0.639322
674 Realistic Writing -0.638714
686 Realistic Establishing_and_Maintaining_Interpersonal_Rel... -0.634594
8026 Interacting_With_Computers Exposed_to_Minor_Burns,_Cuts,_Bites,_or_Stings -0.629980
10556 Electronic_Mail Exposed_to_Contaminants -0.629270
5126 Speaking Spend_Time_Bending_or_Twisting_the_Body -0.627783
12296 Spend_Time_Bending_or_Twisting_the_Body Recognition -0.626270
1373 Active_Listening Spend_Time_Bending_or_Twisting_the_Body -0.623239
5867 Writing Spend_Time_Standing -0.621393
663 Realistic Reading_Comprehension -0.620490
5062 Speaking Handling_and_Moving_Objects -0.619313
12453 Spend_Time_Sitting Wear_Common_Protective_or_Safety_Equipment_suc... -0.617690
4434 Reading_Comprehension Performing_General_Physical_Activities -0.617219
8053 Interacting_With_Computers Spend_Time_Keeping_or_Regaining_Balance -0.616856
10973 Exposed_to_Minor_Burns,_Cuts,_Bites,_or_Stings Indoors,_Environmentally_Controlled -0.616442
1002 job_zone Handling_and_Moving_Objects -0.616243
4492 Reading_Comprehension Spend_Time_Standing -0.615121
11825 Letters_and_Memos Spend_Time_Bending_or_Twisting_the_Body -0.615008
5836 Writing Exposed_to_Minor_Burns,_Cuts,_Bites,_or_Stings -0.614475
12294 Spend_Time_Bending_or_Twisting_the_Body Achievement -0.614046
5051 Speaking Controlling_Machines_and_Processes -0.613475
4097 Persuasion Spend_Time_Using_Your_Hands_to_Handle,_Control... -0.612960
5809 Writing Performing_General_Physical_Activities -0.611421
... ... ...

13041 rows × 3 columns


In [65]:
corr_pairs_df.sort(2, ascending=False)


Out[65]:
0 1 2
1884 Equipment_Maintenance Repairing 0.976774
5166 Systems_Analysis Systems_Evaluation 0.943016
13027 Achievement Recognition 0.919637
4405 Reading_Comprehension Writing 0.916252
2032 Equipment_Selection Repairing 0.909113
1866 Equipment_Maintenance Equipment_Selection 0.905916
3588 Operation_Monitoring Operation_and_Control 0.901321
2311 Instructing Learning_Strategies 0.896822
10922 Exposed_to_High_Places Spend_Time_Climbing_Ladders,_Scaffolds,_or_Poles 0.894912
1928 Equipment_Maintenance Repairing_and_Maintaining_Mechanical_Equipment 0.892552
13037 Recognition Working_Conditions 0.889523
7696 Handling_and_Moving_Objects Performing_General_Physical_Activities 0.888681
4569 Repairing Repairing_and_Maintaining_Mechanical_Equipment 0.887249
1286 Active_Listening Speaking 0.886562
13030 Achievement Working_Conditions 0.885633
3453 Negotiation Persuasion 0.884341
1414 Complex_Problem_Solving Critical_Thinking 0.876100
1419 Complex_Problem_Solving Judgment_and_Decision_Making 0.873171
2745 Management_of_Financial_Resources Management_of_Material_Resources 0.868843
3603 Operation_Monitoring Troubleshooting 0.865452
1720 Critical_Thinking Judgment_and_Decision_Making 0.863481
1893 Equipment_Maintenance Troubleshooting 0.863022
4273 Quality_Control_Analysis Troubleshooting 0.861049
4534 Repairing Troubleshooting 0.860817
1109 Active_Learning Critical_Thinking 0.858779
2041 Equipment_Selection Troubleshooting 0.857719
3746 Operation_and_Control Controlling_Machines_and_Processes 0.857358
1440 Complex_Problem_Solving Systems_Evaluation 0.853486
6601 Coordinating_the_Work_and_Activities_of_Others Developing_and_Building_Teams 0.848110
12821 Achievement/Effort Persistence 0.847364
5930 Analyzing_Data_or_Information Processing_Information 0.844796
2076 Equipment_Selection Repairing_and_Maintaining_Mechanical_Equipment 0.844682
12953 Initiative Persistence 0.841958
12481 Spend_Time_Standing Spend_Time_Walking_and_Running 0.841053
1114 Active_Learning Judgment_and_Decision_Making 0.840695
6513 Controlling_Machines_and_Processes Repairing_and_Maintaining_Mechanical_Equipment 0.839191
5694 Troubleshooting Repairing_and_Maintaining_Mechanical_Equipment 0.837864
13026 Achievement Independence 0.837742
1107 Active_Learning Complex_Problem_Solving 0.837309
13034 Independence Working_Conditions 0.835889
1439 Complex_Problem_Solving Systems_Analysis 0.835821
1100 job_zone Achievement 0.835461
8020 Interacting_With_Computers Electronic_Mail 0.835297
1115 Active_Learning Learning_Strategies 0.835023
11913 Outdoors,_Exposed_to_Weather Outdoors,_Under_Cover 0.834886
1102 job_zone Recognition 0.832792
5830 Writing Electronic_Mail 0.831127
11388 Frequency_of_Decision_Making Impact_of_Decisions_on_Co-workers_or_Company_R... 0.830485
4455 Reading_Comprehension Electronic_Mail 0.829915
1865 Critical_Thinking Working_Conditions 0.829546
12897 Concern_for_Others Social_Orientation 0.828631
12263 Spend_Time_Bending_or_Twisting_the_Body Spend_Time_Kneeling,_Crouching,_Stooping,_or_C... 0.824567
1862 Critical_Thinking Recognition 0.824394
1256 Active_Learning Recognition 0.823616
6174 Coaching_and_Developing_Others Training_and_Teaching_Others 0.823240
11615 In_an_Enclosed_Vehicle_or_Equipment Outdoors,_Exposed_to_Weather 0.822343
13031 Independence Recognition 0.821812
8906 Performing_General_Physical_Activities Spend_Time_Bending_or_Twisting_the_Body 0.821150
1139 Active_Learning Writing 0.821125
2476 Judgment_and_Decision_Making Systems_Evaluation 0.820151
... ... ...

13041 rows × 3 columns


In [ ]:
normed_df_subset = normed_df.domain[['Interests', 'Skills', 'Knowledge']]
#corr_df_2 = normed_df.iloc[:,1:].corr()