This is an introductory exploratory notebook that we used to investigate the text files, figure out which ones we wanted to use, and figure out which variables from them we wanted to use. It doesn't need to be run to understand the project.
There are some groupby statements that are commented out because they take too long to run.
In [1]:
# Load needed modules and functions
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from pylab import figure, show
import pandas as pd
from pandas import DataFrame, Series
In [2]:
#set up path to the data files
import os
#the data directory is a sibling of the notebook directory: ../data
data_folder = os.path.join(os.pardir, "data")
In [3]:
import glob
#list every file in the data folder; os.path.join builds the pattern with the
#platform's separator instead of hard-coding "/" via string concatenation
file_names = glob.glob(os.path.join(data_folder, "*"))
file_names
Out[3]:
In [4]:
#extract each file's base name with a separator-agnostic pattern, so this cell
#works both on Windows ("..\data\Abilities.txt") and POSIX ("../data/Abilities.txt")
#paths -- the original pattern only matched the Windows backslash separator
import re
p = re.compile(r'([^\\/]+)\.txt$')
name_list = []
for name in file_names:
    #last path component without the .txt extension, e.g. "Job Zones"
    frame_name = p.findall(name)[0]
    #snake_case the name so it is a valid Python identifier
    frame_name = frame_name.lower().replace(" ", "_").replace(",", "")
    name_list.append(frame_name)
    #the O*NET files are tab-separated text
    frame = pd.read_table(name, sep='\t')
    #reformat column names: lower-case, drop "*", snake_case
    frame.columns = [x.lower().replace("*", "").replace("-", "_").replace(" ", "_")
                     for x in frame.columns]
    #bind the frame to a notebook-level variable named after the file
    vars()[frame_name] = frame
In [5]:
#here is a list of the names of all of the data frames we now have
name_list
Out[5]:
In [6]:
#tally every column name across all of the loaded dataframes, so we can see
#which columns are shared and how often each one occurs
from collections import Counter
column_names = Counter()
for frame_name in name_list:
    #look the dataframe up by name in the notebook namespace and count its columns
    column_names.update(vars()[frame_name].columns)
column_names
Out[6]:
In [7]:
#function that calculates the number of features available in a dataframe (the # rows divided by # of jobs)
def feature(dataframe):
    """Average number of rows per occupation in `dataframe`.

    Assuming every onet_soc_code contributes the same number of rows, this
    is the number of features each job gets from this table.
    """
    n_rows = len(dataframe)
    n_jobs = len(dataframe.onet_soc_code.unique())
    return n_rows / n_jobs
In [8]:
#function that gets unique values of a dataframe column and merges it with another data frame
def getDescriptions(data, metadata, column_name):
    """Look up the metadata rows for each distinct value of `column_name`.

    Builds a one-column frame of the unique values appearing in `data` and
    inner-merges it with `metadata` on that column, so only values actually
    present in `data` come back with their descriptions.
    """
    unique_frame = pd.DataFrame({column_name: data[column_name].unique()})
    return unique_frame.merge(metadata, on=column_name)
In [9]:
#function to calculate the percentage of rows in an onet data table that are relevant to the given job
def getRelevance(dataframe):
    """Percentage of rows whose not_relevant flag is anything but 'Y'."""
    total = float(len(dataframe))
    kept = float((dataframe['not_relevant'] != 'Y').sum())
    return (kept / total) * 100
In [10]:
#function to calculate how many rows are recommended for exclusion in an onet data table
def getExclusions(dataframe):
    """Percentage of rows flagged recommend_suppress == 'Y'."""
    suppress_mask = dataframe['recommend_suppress'] == 'Y'
    suppressed = dataframe[suppress_mask]
    return (float(len(suppressed)) / float(len(dataframe))) * 100
In this section, we examine the domain-level dataframes. These are
* abilities
* education_training
* knowledge
* interests
* job zones
* skills
* work activities
* work context
* work styles
* work values
In [11]:
#preview the raw abilities table
abilities.head()
Out[11]:
In [12]:
# abilities.groupby(['onet_soc_code','element_id','element_name','scale_id']).apply(sum)
In [13]:
#what are the unique element ids?
abilities.element_id.unique()
Out[13]:
In [14]:
#how many different ability element names are there?
len(abilities.element_name.unique())
Out[14]:
In [15]:
#what are the scales of each ability?
getDescriptions(abilities,scales_reference,"scale_id")
Out[15]:
In [16]:
#what do the ability element names mean?
getDescriptions(abilities, content_model_reference, "element_name")
Out[16]:
In [17]:
#how many abilities features are there?
feature(abilities)
Out[17]:
In [18]:
#percentage of relevant ability rows?
getRelevance(abilities)
Out[18]:
In [19]:
#percentage of rows to be excluded
getExclusions(abilities)
Out[19]:
In [20]:
#inspect the importance (IM) scores of ability rows flagged for suppression
#NOTE(review): only the cell's last expression is displayed, so the .max() on the
#fourth line is swallowed; the 4.25 filter presumably repeats that maximum -- confirm by re-running
job_elements = abilities[['onet_soc_code','element_id']][abilities.recommend_suppress == 'Y']
frame = pd.merge(job_elements, abilities, on=['onet_soc_code','element_id'])
frame = frame[frame.scale_id == 'IM']
frame.data_value.max()
frame[frame.data_value == 4.25]
Out[20]:
In [21]:
#domain data set #2- what does the education, training, and experience data look like
#it has onet_soc_code-element_id/_name-scale_id-category
# education_training_and_experience.groupby(['onet_soc_code','element_id','element_name','scale_id',"category"]).apply(sum)
In [22]:
#what are the unique element_names and what do they mean?
getDescriptions(education_training_and_experience,content_model_reference,"element_name")
Out[22]:
In [23]:
#what are the unique scales in the education training data and what do they mean?
getDescriptions(education_training_and_experience, scales_reference, "scale_id")
#looks like there is a one-to-one relationship between the element names and the scale ids
Out[23]:
In [24]:
#what are the different categories in the education training data and what do they mean?
getDescriptions(education_training_and_experience, education_training_and_experience_categories, "category")
#meaning of category is dependent on the scale_id/element-name/element-id
Out[24]:
In [25]:
#how many education and training features are there
feature(education_training_and_experience)
#len(education_training_and_experience)/len(education_training_and_experience.onet_soc_code.unique())
Out[25]:
In [26]:
#what percentage of rows are relevant to the job?
#getRelevance(education_training_and_experience)
#this throws an error because there is no relevance column- everything is relevant
In [27]:
#percentage recommended suppressed
getExclusions(education_training_and_experience)
Out[27]:
In [28]:
#what does it look like?
knowledge.head()
Out[28]:
In [29]:
#what are the unique knowledge element names?
knowledge.element_name.unique()
Out[29]:
In [30]:
#what does it look like grouped by the factors?
# knowledge.groupby(['onet_soc_code','element_id','element_name','scale_id']).apply(sum)
#like abilities, it's grouped by onet_soc_code-element_id/name-scale_id
In [31]:
#what are the unique element_names and what do they mean?
getDescriptions(knowledge, content_model_reference, "element_name")
Out[31]:
In [32]:
#what are the different knowledge scales and what do they mean?
getDescriptions(knowledge, scales_reference, "scale_id")
#these are the same as for ability
Out[32]:
In [33]:
#how many different knowledge features are there?
feature(knowledge)
Out[33]:
In [34]:
#percentage relevant
getRelevance(knowledge)
Out[34]:
In [35]:
#percent to be excluded
getExclusions(knowledge)
Out[35]:
In [36]:
#show the importance (IM) rows flagged for suppression
#job_elements = knowledge[['onet_soc_code','element_id']][abilities.recommend_suppress == 'Y
test = knowledge[knowledge.recommend_suppress == 'Y']
test[test.scale_id == "IM"]
# frame = pd.merge(job_elements, abilities, on=['onet_soc_code','element_id'])
# frame = frame[frame.scale_id == 'IM']
# frame.data_value.max()
# frame[frame.data_value == 4.25]
Out[36]:
In [37]:
#what does it look like?
interests.head()
Out[37]:
In [38]:
#what does it look like grouped by the factors?
# interests.groupby(['onet_soc_code','element_id','element_name','scale_id']).apply(sum)
#looks like one-to-one matching between element_name and scale_id
In [39]:
#what do these element names mean?
getDescriptions(interests, content_model_reference, "element_name")
Out[39]:
In [40]:
#what do the scale_ids mean?
getDescriptions(interests, scales_reference, "scale_id")
Out[40]:
In [41]:
#how many total interests features are there?
feature(interests)
Out[41]:
In [42]:
#What do the job zones look like?
job_zones.head()
#there's a one-to-one relationship between jobs and job_zone, so we don't need to groupby
Out[42]:
In [43]:
#what is a job zone?
getDescriptions(job_zones, job_zone_reference, "job_zone")
#these seem to be a very closely related, simplified version of the education training information
Out[43]:
In [44]:
#how many features are in the job zone data
feature(job_zones)
Out[44]:
In [45]:
#what do the skills look like?
skills.head()
Out[45]:
In [46]:
#what do the skills look like grouped by factor
# skills.groupby(['onet_soc_code','element_id','element_name','scale_id']).apply(sum)
In [47]:
#what are the different element names?
getDescriptions(skills, content_model_reference, "element_name")
Out[47]:
In [48]:
#What do the different scales mean?
getDescriptions(skills, scales_reference, "scale_id")
#they are the same scales as in abilities
Out[48]:
In [49]:
#how many skills features are there?
feature(skills)
Out[49]:
In [50]:
#what fraction of the skill combinations are relevant to the job
getRelevance(skills)
Out[50]:
In [51]:
#what percentage of skill combinations are recommended to be excluded?
getExclusions(skills)
Out[51]:
In [52]:
#importance (IM) rows flagged for suppression
test = skills[skills.recommend_suppress == 'Y']
test[test.scale_id == "IM"]
Out[52]:
In [53]:
#what does it look like?
work_activities.head()
Out[53]:
In [54]:
#grouped by the factors
# work_activities.groupby(['onet_soc_code','element_id','element_name','scale_id']).apply(sum)
#grouped same as most of the other data frames- same scale_id
In [55]:
#what do each of the elements mean?
getDescriptions(work_activities, content_model_reference, "element_name")
Out[55]:
In [56]:
#don't need to do scale- it's the same as abilities and a bunch of other data frames- importance and level
In [57]:
#how many total features are there?
feature(work_activities)
Out[57]:
In [58]:
#percentage of rows that are relevant
getRelevance(work_activities)
Out[58]:
In [59]:
#percentage of rows that should be excluded
getExclusions(work_activities)
Out[59]:
In [60]:
#importance (IM) rows flagged for suppression
test = work_activities[work_activities.recommend_suppress == 'Y']
test[test.scale_id == 'IM']
Out[60]:
In [61]:
#preview the work context table
work_context.head()
Out[61]:
In [62]:
#group by the factors
# work_context.groupby(['onet_soc_code','element_id','element_name','scale_id','category']).apply(sum)
In [63]:
#how many distinct work context elements are there?
len(work_context.element_name.unique())
Out[63]:
In [64]:
#inspect the rows with scale_id "CT"
work_context[work_context.scale_id == "CT"]
Out[64]:
In [65]:
#what are the scales?
getDescriptions(work_context, scales_reference, "scale_id")
Out[65]:
#what are the categories? These definitions are in the work_context_categories dataframe
#need to change work_context.category from dtype object to dtype int
#replace the "n/a" placeholder first so the cast to int can succeed; assign the
#result back to the column instead of Series.replace(..., inplace=True) -- in-place
#replacement through the attribute accessor is deprecated in modern pandas and
#may silently fail to propagate back to the parent frame
work_context["category"] = work_context["category"].replace("n/a", "0")
In [67]:
#after the replacement above the column holds only numeric strings, so the int cast is safe
work_context.category = work_context.category.astype(int)
In [68]:
#join the category codes to their text definitions, then look at the CTP scale rows
categories_desc = getDescriptions(work_context, work_context_categories, "category")
categories_desc[categories_desc.scale_id == "CTP"]
#looks like they are dependent on the element name/id and the scale, let's groupby
Out[68]:
In [69]:
#let's group the category description to figure out what's going on
categories_desc.groupby(['element_name','scale_id','category']).apply(sum)
Out[69]:
In [70]:
#how many features are in work context?
feature(work_context)
Out[70]:
In [71]:
#percent that are relevant
getRelevance(work_context)
Out[71]:
In [72]:
#percent to be excluded
getExclusions(work_context)
Out[72]:
In [73]:
#preview the work styles table
work_styles.head()
Out[73]:
In [74]:
#grouped by the factors
# work_styles.groupby(['onet_soc_code','element_id','element_name','scale_id']).apply(sum)
In [75]:
#what are the elements
getDescriptions(work_styles, content_model_reference, "element_name")
Out[75]:
In [76]:
#what are the different scales?
getDescriptions(work_styles, scales_reference, "scale_id")
Out[76]:
In [77]:
#how many features are there?
feature(work_styles)
Out[77]:
In [78]:
#what percentage should be excluded?
getExclusions(work_styles)
Out[78]:
In [79]:
#preview the work values table
work_values.head()
Out[79]:
In [80]:
# work_values.groupby(['onet_soc_code','element_id','element_name','scale_id']).apply(sum)
In [81]:
#what are the different element names?
getDescriptions(work_values, content_model_reference, "element_name")
Out[81]:
In [82]:
#get scales
getDescriptions(work_values, scales_reference, "scale_id")
Out[82]:
In [83]:
#what are the number of features
feature(work_values)
Out[83]: