In [1]:
# Load needed modules and functions
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from pylab import figure, show
import pandas as pd
from pandas import DataFrame, Series
from sklearn.neighbors import NearestNeighbors
In [2]:
#set up path to the data files
import os
data_folder = os.path.join(os.pardir, "data")  # ../data relative to the notebook's working directory
In [3]:
import glob
# Collect every file in the data folder.  os.path.join builds the pattern
# with the correct separator on any OS, replacing both the hardcoded '/'
# and the commented-out Windows-specific variant of this line.
file_names = glob.glob(os.path.join(data_folder, "*"))
In [4]:
import re

# Read every data file into a DataFrame and expose it as a module-level
# variable named after the file (e.g. "Work Values.txt" -> work_values).
name_list = []
for name in file_names:
    # Derive the frame name from the file name itself.  basename/splitext
    # are separator-agnostic, so this works on any OS (the original
    # 'data/(.*).txt' regex assumed POSIX separators, an exact 'data'
    # folder name, and raised IndexError on non-matching files).
    frame_name = os.path.splitext(os.path.basename(name))[0]
    frame_name = frame_name.lower().replace(" ", "_")
    frame_name = frame_name.replace(",", "")
    name_list.append(frame_name)
    frame = pd.read_table(name, sep='\t')
    # Reformat column names: lowercase, drop '*', use '_' for '-' and ' '.
    columns = [x.lower().replace("*", "").replace("-", "_").replace(" ", "_")
               for x in frame.columns]
    frame.columns = columns
    # Create a variable named frame_name (vars() at module scope is the
    # module namespace); the cells below refer to these names directly.
    vars()[frame_name] = frame
In [5]:
def feature(dataframe):
    """Average number of feature rows per occupation: the total row count
    divided by the number of distinct `onet_soc_code` values."""
    distinct_jobs = dataframe.onet_soc_code.unique().size
    return len(dataframe) / distinct_jobs
In [6]:
# In abilities, keep only the rows whose scale_id is 'IM'.
abilities_final = abilities.loc[abilities['scale_id'] == 'IM']
In [7]:
len(abilities_final)  # row count after the scale_id filter
Out[7]:
In [8]:
feature(abilities_final)  # rows per distinct occupation code
Out[8]:
In [9]:
# In knowledge, keep only the rows whose scale_id is 'IM'.
knowledge_final = knowledge.loc[knowledge['scale_id'] == 'IM']
In [10]:
len(knowledge_final)  # row count after the scale_id filter
Out[10]:
In [11]:
feature(knowledge_final)  # rows per distinct occupation code
Out[11]:
In [12]:
#In interests, we only want to keep rows where scale_id == 'OI'
# .copy() makes an independent frame: a later cell assigns a 'domain'
# column, which on a plain slice triggers SettingWithCopyWarning and may
# write into a view of `interests`.
interests_final = interests[interests.scale_id == 'OI'].copy()
In [13]:
len(interests_final)  # row count after the scale_id filter
Out[13]:
In [14]:
feature(interests_final)  # rows per distinct occupation code
Out[14]:
In [15]:
# NOTE(review): interests_final is a slice of `interests`; this column
# assignment can raise SettingWithCopyWarning -- consider .copy() upstream.
interests_final['domain'] = 'Interests'
interests_final.head()
Out[15]:
In [16]:
# One row per occupation, one (domain, element_name) column per interest.
# NOTE(review): rows=/cols= are pandas < 0.14 keywords (index=/columns= today).
interests_pt = interests_final.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
interests_pt.head()
Out[16]:
In [17]:
#we do not need to filter job_zones, but take a copy so the column
#assignments in the next cells do not also mutate the original
#`job_zones` frame through this alias.
job_zones_final = job_zones.copy()
In [18]:
len(job_zones_final)  # total row count
Out[18]:
In [19]:
feature(job_zones_final)  # rows per distinct occupation code
Out[19]:
In [20]:
job_zones_final.head()
Out[20]:
In [21]:
# Tag rows so the pivot yields a single (Job_Zones, job_zone) column.
job_zones_final['domain'] = 'Job_Zones'
job_zones_final['element_name'] = 'job_zone'
# NOTE(review): rows=/cols= are pandas < 0.14 keywords (index=/columns= today).
job_zones_pt = job_zones_final.pivot_table('job_zone',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
job_zones_pt.head()
Out[21]:
In [22]:
#for skills, we only want to keep rows where scale_id == "IM"
# .copy() so the later 'domain' column assignment does not hit
# SettingWithCopyWarning / write into a view of `skills`.
skills_final = skills[skills.scale_id == 'IM'].copy()
In [23]:
len(skills_final)  # row count after the scale_id filter
Out[23]:
In [24]:
feature(skills_final)  # rows per distinct occupation code
Out[24]:
In [25]:
skills_final.head()
Out[25]:
In [26]:
# Tag rows so pivoted columns fall under the 'Skills' domain.
skills_final['domain'] = 'Skills'
# NOTE(review): rows=/cols= are pandas < 0.14 keywords (index=/columns= today).
skills_pt = skills_final.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
skills_pt.head()
Out[26]:
In [27]:
#for work activities, we only want to keep rows where scale_id == 'IM'
# .copy() so the later 'domain' column assignment does not hit
# SettingWithCopyWarning / write into a view of `work_activities`.
work_activities_final = work_activities[work_activities.scale_id == 'IM'].copy()
In [28]:
len(work_activities_final)  # row count after the scale_id filter
Out[28]:
In [29]:
feature(work_activities_final)  # rows per distinct occupation code
Out[29]:
In [30]:
# Tag rows so pivoted columns fall under the 'Work_Activities' domain.
work_activities_final['domain'] = 'Work_Activities'
# NOTE(review): rows=/cols= are pandas < 0.14 keywords (index=/columns= today).
work_activities_pt = work_activities_final.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
work_activities_pt.head()
Out[30]:
In [31]:
#in work context, we only want to keep rows where scale_id == 'CX' or 'CT'
work_context_final = work_context[work_context['scale_id'].isin(['CX', 'CT'])]
In [32]:
len(work_context_final)  # row count after the scale_id filter
Out[32]:
In [33]:
feature(work_context_final)  # rows per distinct occupation code
Out[33]:
In [34]:
# Split the CX and CT scales apart; .copy() so the 'domain' column
# assignments in the next cells do not trigger SettingWithCopyWarning
# or write into views of work_context_final.
work_context_final_CX = work_context_final[work_context_final['scale_id'] == 'CX'].copy()
work_context_final_CT = work_context_final[work_context_final['scale_id'] == 'CT'].copy()
In [35]:
# Tag rows so pivoted columns fall under the 'Work_Context' domain.
work_context_final_CX['domain'] = 'Work_Context'
# NOTE(review): rows=/cols= are pandas < 0.14 keywords (index=/columns= today).
work_context_CX_pt = work_context_final_CX.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
work_context_CX_pt.head()
Out[35]:
In [36]:
# Tag rows so pivoted columns fall under the 'Work_Context_Time' domain.
work_context_final_CT['domain'] = 'Work_Context_Time'
# NOTE(review): rows=/cols= are pandas < 0.14 keywords (index=/columns= today).
work_context_CT_pt = work_context_final_CT.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
work_context_CT_pt.head()
Out[36]:
In [37]:
#in work styles, we can keep everything; copy so the column assignment
#below does not also mutate the original `work_styles` frame through
#this alias.
work_styles_final = work_styles.copy()
In [38]:
len(work_styles_final)  # total row count
Out[38]:
In [39]:
feature(work_styles_final)  # rows per distinct occupation code
Out[39]:
In [40]:
# Tag rows so pivoted columns fall under the 'Work_Styles' domain.
work_styles_final['domain'] = 'Work_Styles'
# NOTE(review): rows=/cols= are pandas < 0.14 keywords (index=/columns= today).
work_styles_pt = work_styles_final.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
work_styles_pt.head()
Out[40]:
In [41]:
#in work values, we want to only keep rows where scale_id == 'EX'
# .copy() so the later 'domain' column assignment does not hit
# SettingWithCopyWarning / write into a view of `work_values`.
work_values_final = work_values[work_values.scale_id == 'EX'].copy()
In [42]:
len(work_values_final)  # row count after the scale_id filter
Out[42]:
In [43]:
feature(work_values_final)  # rows per distinct occupation code
Out[43]:
In [44]:
# Tag rows so pivoted columns fall under the 'Work_Values' domain.
work_values_final['domain'] = 'Work_Values'
# NOTE(review): rows=/cols= are pandas < 0.14 keywords (index=/columns= today).
work_values_pt = work_values_final.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
work_values_pt.head()
Out[44]:
In [45]:
# Build a one-column (Occupation, title) pivot keyed by onet_soc_code.
occupation_data['element_name'] = "title"
occupation_data['domain'] = 'Occupation'
# NOTE(review): rows=/cols= are pandas < 0.14 keywords (index=/columns= today).
occ_data_pt = occupation_data.pivot_table('title',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
# Clean the titles: turn ' ', '/', ',' into underscores.  Index the
# MultiIndex column with its tuple -- the original attribute-chained
# assignment (occ_data_pt.Occupation.title = ...) writes to a temporary
# cross-section and may not propagate back to occ_data_pt.
occ_data_pt[('Occupation', 'title')] = occ_data_pt[('Occupation', 'title')].apply(
    lambda x: x.replace(' ', '_').replace('/', '_').replace(',', '_'))
occ_data_pt.tail()
len(set(occ_data_pt.index))  # distinct occupation codes in the pivot
Out[45]:
In [46]:
# Stitch all domain pivot tables side-by-side (aligned on onet_soc_code),
# then attach the occupation titles via an index-on-index merge
# (pd.merge defaults to an inner join).
domain_pt_list = [interests_pt, job_zones_pt, skills_pt, work_activities_pt, work_context_CX_pt, work_context_CT_pt, work_styles_pt, work_values_pt]
combined_df = pd.concat(domain_pt_list, axis=1)
combined_df = pd.merge(occ_data_pt, combined_df, left_index = True, right_index = True)
combined_df.head()
Out[46]:
In [47]:
# Remove spaces in element names (the lambda is applied to every column label)
combined_df = combined_df.rename(columns=lambda x: x.replace(' ', '_'))
# combined_df.fillna(0, inplace=True)
In [48]:
combined_df.head()
Out[48]:
In [49]:
def normalize(series):
    """Min-max scale the values of `series` to [0, 1], returned as a list."""
    lowest = series.min()
    spread = series.max() - lowest
    return [(value - lowest) / spread for value in series]
In [50]:
# Min-max normalise every feature column; column 0 (the occupation
# title) is excluded from normalisation.
normed_df = combined_df.copy()
normed_df.iloc[:,1:] = normed_df.iloc[:,1:].apply(normalize)
normed_df.head()
Out[50]:
Visualizing the Features
In [51]:
from math import floor, ceil

def draw_histogram(domain_frame, bins=10, ylim=(0, 500)):
    """Draw one histogram per column of `domain_frame` on a 3-wide grid.

    Parameters
    ----------
    domain_frame : DataFrame whose columns are histogrammed.
    bins : int, number of histogram bins (default 10, as before).
    ylim : y-axis limits applied to every subplot (default (0, 500)).
    """
    n_cols = len(domain_frame.columns)
    n_rows = int(ceil(float(n_cols) / 3.0))
    # squeeze=False keeps `axes` 2-D even when there is a single row, so
    # axes[row, col] indexing never breaks (the original failed for
    # frames with 3 or fewer columns, where subplots returns a 1-D array).
    fig, axes = plt.subplots(nrows=n_rows, ncols=3, squeeze=False,
                             figsize=(12, n_cols))
    plt.subplots_adjust(hspace=0.4)
    for i, column_name in enumerate(domain_frame.columns):
        ax = axes[int(floor(i / 3)), i % 3]
        domain_frame[column_name].hist(bins=bins, ax=ax)
        ax.set_title(column_name)
        ax.set_ylim(list(ylim))
    # Hide any unused trailing panels in the grid.
    for j in range(n_cols, n_rows * 3):
        axes[int(floor(j / 3)), j % 3].set_visible(False)
In [52]:
# One histogram grid per feature domain:
# draw_histogram(combined_df.Job_Zones)
# normed_df.Interests.Investigative.hist(bins=10)
draw_histogram(normed_df.Interests)
In [53]:
draw_histogram(normed_df.Skills)
In [54]:
draw_histogram(normed_df.Work_Activities)
In [55]:
draw_histogram(normed_df.Work_Context)
In [56]:
draw_histogram(normed_df.Work_Context_Time)
In [57]:
draw_histogram(normed_df.Work_Styles)
In [58]:
draw_histogram(normed_df.Work_Values)
Correlation between Features
In [59]:
# Pairwise correlations between all normalised feature columns;
# column 0 (the occupation title) is excluded.
corr_df = normed_df.iloc[:,1:].corr()
In [60]:
# Drop the `domain` level so rows/columns are labelled by element name only.
corr_df.index = corr_df.index.droplevel(0)
corr_df.head()
Out[60]:
In [61]:
corr_df.columns = corr_df.columns.droplevel(0)
corr_df.head()
Out[61]:
In [62]:
# Collect every unordered pair of distinct features with its correlation.
# Upper triangle only (j > i) so each pair appears exactly once.
corr_pairs_list = []
for i in range(len(corr_df.index)):
    row_name = corr_df.index[i]
    for j in range(i + 1, len(corr_df.columns)):
        column_name = corr_df.columns[j]
        # .iloc is purely positional; the original .ix was ambiguous
        # (label-vs-position fallback) and is removed in pandas >= 1.0.
        corr_pairs_list.append([row_name, column_name, corr_df.iloc[i, j]])
In [63]:
corr_pairs_df = DataFrame(corr_pairs_list)  # columns: 0=feature A, 1=feature B, 2=correlation
In [64]:
# Least-correlated pairs first.
# NOTE(review): DataFrame.sort was removed in pandas 0.20 -- on modern
# pandas use corr_pairs_df.sort_values(2).
corr_pairs_df.sort(2)
Out[64]:
In [65]:
# Most-correlated pairs first.
corr_pairs_df.sort(2, ascending=False)
Out[65]:
In [ ]:
# NOTE(review): `normed_df.domain` assumes a top-level attribute named
# `domain`; with the (domain, element_name) MultiIndex columns this likely
# fails, and no 'Knowledge' pivot was built above -- verify before use.
normed_df_subset = normed_df.domain[['Interests', 'Skills', 'Knowledge']]
#corr_df_2 = normed_df.iloc[:,1:].corr()