Set up


In [1]:
# Load needed modules and functions
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np

from pylab import figure, show

import pandas as pd
from pandas import DataFrame, Series

In [2]:
#set up path to the data files
import os
data_folder = os.path.join(os.pardir, "data")

In [3]:
import glob 
file_names = glob.glob(data_folder + "/*")

In [4]:
import re
p = re.compile('data/(.*).txt')
name_list = []
for name in file_names:
    frame_name = p.findall(name)[0]
    frame_name = frame_name.lower().replace(" ","_")
    frame_name = frame_name.replace(",","")
    name_list.append(frame_name)
    frame = pd.read_table(name, sep= '\t')
    #reformat column names
    columns = frame.columns
    columns = [x.lower().replace("*","").replace("-","_").replace(" ","_") for x in columns]
    frame.columns = columns
    #create a variable named the frame_name
    vars()[frame_name] = frame


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-4-bbc820e6adb5> in <module>()
      3 name_list = []
      4 for name in file_names:
----> 5     frame_name = p.findall(name)[0]
      6     frame_name = frame_name.lower().replace(" ","_")
      7     frame_name = frame_name.replace(",","")

IndexError: list index out of range

Functions


In [ ]:
#function that calculates the number of features available in a dataframe (the # rows divided by # of jobs)
def feature(dataframe):
    return len(dataframe)/len(dataframe.onet_soc_code.unique())

Data Filtering

Abilities


In [ ]:
#In abilities, we only want to keep the rows where scale_id == 'IM'
abilities_final = abilities[abilities.scale_id == 'IM']

In [ ]:
len(abilities_final)

In [ ]:
feature(abilities_final)

Knowledge


In [ ]:
#In knowledge, we only want to keep the rows where scale_id == 'IM'
knowledge_final = knowledge[knowledge.scale_id == 'IM']

In [ ]:
len(knowledge_final)

In [ ]:
feature(knowledge_final)

Interests


In [ ]:
#In interests, we only want to keep rows where scale_id == 'OI'
interests_final = interests[interests.scale_id == 'OI']

In [ ]:
len(interests_final)

In [ ]:
feature(interests_final)

In [ ]:
interests_final

Job Zones


In [ ]:
#we do not need to do anything to job_zones
job_zones_final = job_zones

In [ ]:
len(job_zones_final)

In [ ]:
feature(job_zones_final)

Skills


In [5]:
#for skills, we only want to keep rows where scale_id == "IM"
skills_final = skills[skills.scale_id == 'IM']


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-5-58d1a8d44d54> in <module>()
      1 #for skills, we only want to keep rows where scale_id == "IM"
----> 2 skills_final = skills[skills.scale_id == 'IM']

NameError: name 'skills' is not defined

In [6]:
len(skills_final)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-a6484ed612b4> in <module>()
----> 1 len(skills_final)

NameError: name 'skills_final' is not defined

In [7]:
feature(skills_final)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-b3f25301411c> in <module>()
----> 1 feature(skills_final)

NameError: name 'feature' is not defined

Work Activities


In [8]:
#for work activities, we only want to keep rows where scale_id == 'IM'
work_activities_final = work_activities[work_activities.scale_id == 'IM']


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-1b32b88c5ca8> in <module>()
      1 #for work activities, we only want to keep rows where scale_id == 'IM'
----> 2 work_activities_final = work_activities[work_activities.scale_id == 'IM']

NameError: name 'work_activities' is not defined

In [9]:
len(work_activities_final)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-9-14154da5c0a2> in <module>()
----> 1 len(work_activities_final)

NameError: name 'work_activities_final' is not defined

In [10]:
feature(work_activities_final)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-10-ced00d5ac589> in <module>()
----> 1 feature(work_activities_final)

NameError: name 'feature' is not defined

Work Context


In [11]:
#in work context, we only want to keep rows where scale_id == 'CX' or 'CT'
work_context_final = work_context[(work_context['scale_id'] == 'CX') | (work_context['scale_id'] == 'CT')]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-10a9f43a2c34> in <module>()
      1 #in work context, we only want to keep rows where scale_id == 'CX' or 'CT'
----> 2 work_context_final = work_context[(work_context['scale_id'] == 'CX') | (work_context['scale_id'] == 'CT')]

NameError: name 'work_context' is not defined

In [12]:
len(work_context_final)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-12-cc1caeaf0afc> in <module>()
----> 1 len(work_context_final)

NameError: name 'work_context_final' is not defined

In [13]:
feature(work_context_final)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-13-61d074899dfa> in <module>()
----> 1 feature(work_context_final)

NameError: name 'feature' is not defined

Work Styles


In [14]:
#in work styles, we can keep everythin
work_styles_final = work_styles


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-14-da78afb8261e> in <module>()
      1 #in work styles, we can keep everythin
----> 2 work_styles_final = work_styles

NameError: name 'work_styles' is not defined

In [15]:
len(work_styles_final)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-15-e155dfeae0e4> in <module>()
----> 1 len(work_styles_final)

NameError: name 'work_styles_final' is not defined

In [16]:
feature(work_styles_final)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-16-56b5040c172a> in <module>()
----> 1 feature(work_styles_final)

NameError: name 'feature' is not defined

Work Values


In [17]:
#in work values, we want to only keep rows where scale_id == 'EX'
work_values_final = work_values[work_values.scale_id == 'EX']


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-17-4d6f7c1d0dad> in <module>()
      1 #in work values, we want to only keep rows where scale_id == 'EX'
----> 2 work_values_final = work_values[work_values.scale_id == 'EX']

NameError: name 'work_values' is not defined

In [18]:
len(work_values_final)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-18-ee9882c22738> in <module>()
----> 1 len(work_values_final)

NameError: name 'work_values_final' is not defined

In [19]:
feature(work_values_final)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-19-231aa5e62b47> in <module>()
----> 1 feature(work_values_final)

NameError: name 'feature' is not defined