In [1]:
# Load needed modules and functions
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from pylab import figure, show
import pandas as pd
from pandas import DataFrame, Series
In [2]:
# Location of the data files: a "data" directory next to the parent
# directory entry, expressed as a relative path.
import os
data_folder = os.path.join(os.path.pardir, "data")
In [3]:
import glob

# Collect only the ".txt" data files. The pattern is built with os.path.join
# rather than a hard-coded "/" separator, and the result is sorted because
# glob.glob() returns matches in arbitrary order.
file_names = sorted(glob.glob(os.path.join(data_folder, "*.txt")))
In [4]:
import re

# Capture the file stem from a path of the form "...data<sep><stem>.txt".
# Both "/" and "\" are accepted as the separator, and the dot is escaped and
# anchored so that only a literal ".txt" extension matches.
p = re.compile(r'data[\\/](.*)\.txt$')

name_list = []
for name in file_names:
    match = p.search(name)
    if match is None:
        # Not a "<stem>.txt" data file (e.g. a sub-directory or stray file):
        # skip it instead of failing with IndexError on an empty match list.
        continue

    # Turn the file stem into a lower-case, underscore-separated name.
    frame_name = match.group(1)
    frame_name = frame_name.lower().replace(" ", "_")
    frame_name = frame_name.replace(",", "")
    name_list.append(frame_name)

    frame = pd.read_table(name, sep='\t')

    # Reformat column names: lower-case, drop "*", and replace "-" and " "
    # with "_".
    columns = frame.columns
    columns = [x.lower().replace("*", "").replace("-", "_").replace(" ", "_") for x in columns]
    frame.columns = columns

    # Publish the frame as a module-level variable named after the file;
    # cells further down use names such as `abilities` and `skills` directly.
    globals()[frame_name] = frame
In [ ]:
def feature(dataframe):
    """Return the number of rows per distinct job in ``dataframe``.

    The value is the row count divided by the number of unique values in the
    ``onet_soc_code`` column (the # rows divided by # of jobs).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame exposing an ``onet_soc_code`` column.

    Returns
    -------
    The ratio ``len(dataframe) / <number of unique onet_soc_code values>``,
    or ``0.0`` when the frame contains no codes at all (for example an empty
    frame left over after filtering), instead of raising ZeroDivisionError.
    """
    n_jobs = len(dataframe.onet_soc_code.unique())
    if n_jobs == 0:
        # Nothing to divide by: report zero features rather than crashing.
        return 0.0
    return len(dataframe) / n_jobs
In [ ]:
#In abilities, we only want to keep the rows where scale_id == 'IM'
abilities_final = abilities[abilities.scale_id == 'IM']
In [ ]:
len(abilities_final)
In [ ]:
feature(abilities_final)
In [ ]:
#In knowledge, we only want to keep the rows where scale_id == 'IM'
knowledge_final = knowledge[knowledge.scale_id == 'IM']
In [ ]:
len(knowledge_final)
In [ ]:
feature(knowledge_final)
In [ ]:
#In interests, we only want to keep rows where scale_id == 'OI'
interests_final = interests[interests.scale_id == 'OI']
In [ ]:
len(interests_final)
In [ ]:
feature(interests_final)
In [ ]:
interests_final
In [ ]:
#we do not need to do anything to job_zones
job_zones_final = job_zones
In [ ]:
len(job_zones_final)
In [ ]:
feature(job_zones_final)
In [5]:
#for skills, we only want to keep rows where scale_id == "IM"
skills_final = skills[skills.scale_id == 'IM']
In [6]:
len(skills_final)
In [7]:
feature(skills_final)
In [8]:
#for work activities, we only want to keep rows where scale_id == 'IM'
work_activities_final = work_activities[work_activities.scale_id == 'IM']
In [9]:
len(work_activities_final)
In [10]:
feature(work_activities_final)
In [11]:
#in work context, we only want to keep rows where scale_id == 'CX' or 'CT'
work_context_final = work_context[(work_context['scale_id'] == 'CX') | (work_context['scale_id'] == 'CT')]
In [12]:
len(work_context_final)
In [13]:
feature(work_context_final)
In [14]:
#in work styles, we can keep everythin
work_styles_final = work_styles
In [15]:
len(work_styles_final)
In [16]:
feature(work_styles_final)
In [17]:
#in work values, we want to only keep rows where scale_id == 'EX'
work_values_final = work_values[work_values.scale_id == 'EX']
In [18]:
len(work_values_final)
In [19]:
feature(work_values_final)