This notebook performs hierarchical clustering and other distance-based analyses on a data set of occupations and their features. It loads and reshapes the O*NET files, normalizes the features, explores their distributions and correlations, builds pairwise distance matrices, and then applies DBSCAN and hierarchical (Ward) clustering, along with helper functions for finding unique occupations and comparing pairs of occupations.
Data Source: O*NET, the Occupational Information Network from the US Department of Labor.
In [1]:
# Load needed modules and functions
import matplotlib.pyplot as plt
%matplotlib inline
import os
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import sklearn
from sklearn.neighbors import NearestNeighbors
from pylab import figure, show
In [2]:
#set up path to the data files
data_folder = os.path.join(os.pardir, "data")
In [3]:
file_names = ['Abilities.txt','Interests.txt','Job Zones.txt', 'Knowledge.txt','Occupation Data.txt','Skills.txt','Work Activities.txt','Work Context Categories.txt','Work Context.txt','Work Styles.txt','Work Values.txt']
In [4]:
#read in each of the files into a dataframe
name_list = []
for name in file_names:
frame_name = name.replace('.txt','').lower().replace(" ","_").replace(",","")
name_list.append(frame_name)
frame = pd.read_table(data_folder + '/' + name, sep= '\t')
#reformat column names
frame.columns = [x.lower().replace("*","").replace("-","_").replace(" ","_") for x in frame.columns]
#create a variable named the frame_name that contains the data
vars()[frame_name] = frame
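A note on the loading pattern: vars() injects one module-level dataframe per file, which works but can be hard to trace. A purely illustrative alternative (a sketch only; the rest of the notebook uses the per-file variables created above) would collect the frames in a dictionary keyed by the same names:
# Sketch of a dict-based alternative to the vars() approach (hypothetical, not used below)
frames = {}
for name in file_names:
    frame_name = name.replace('.txt', '').lower().replace(" ", "_").replace(",", "")
    frame = pd.read_table(os.path.join(data_folder, name), sep='\t')
    frame.columns = [x.lower().replace("*", "").replace("-", "_").replace(" ", "_") for x in frame.columns]
    frames[frame_name] = frame
# e.g. frames['abilities'] would hold the same data as the abilities variable created above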
In [5]:
#function that calculates the number of features available in a dataframe (the # rows divided by # of jobs)
def feature(dataframe):
return len(dataframe)/len(dataframe.onet_soc_code.unique())
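Note that under Python 2, len(dataframe)/len(...) performs integer division, which would silently floor any non-integer ratio; a float cast makes the intent explicit (a sketch only, behaving identically whenever the ratio is a whole number):
# Sketch: explicit float version of the helper above (hypothetical name)
def feature_float(dataframe):
    return float(len(dataframe)) / len(dataframe.onet_soc_code.unique())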
In [6]:
#In abilities, we only want to keep the rows where scale_id == 'IM'
abilities_final = abilities[abilities.scale_id == 'IM']
In [7]:
len(abilities_final)
Out[7]:
In [8]:
feature(abilities_final)
Out[8]:
In [9]:
abilities_final['domain'] = 'Abilities'
abilities_final.head()
Out[9]:
In [10]:
abilities_pt = abilities_final.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
abilities_pt.head()
Out[10]:
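The keep-one-scale-then-pivot pattern above is repeated for most of the domains that follow. A small helper could express it once (a sketch only, written against the same older pandas rows/cols keywords used throughout this notebook; it is not used in the cells below):
# Hypothetical helper capturing the repeated filter-and-pivot pattern
def domain_pivot(frame, domain_label, scale=None, value='data_value'):
    if scale is not None:
        frame = frame[frame.scale_id == scale]
    frame['domain'] = domain_label
    return frame.pivot_table(value,
                             rows='onet_soc_code',
                             cols=['domain', 'element_name'],
                             aggfunc='sum')
# e.g. domain_pivot(knowledge, 'Knowledge', scale='IM') should reproduce knowledge_pt below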
In [11]:
#In knowledge, we only want to keep the rows where scale_id == 'IM'
knowledge_final = knowledge[knowledge.scale_id == 'IM']
In [12]:
len(knowledge_final)
Out[12]:
In [13]:
feature(knowledge_final)
Out[13]:
In [14]:
knowledge_final['domain'] = 'Knowledge'
knowledge_final.head()
Out[14]:
In [15]:
knowledge_pt = knowledge_final.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
knowledge_pt.head()
Out[15]:
In [16]:
#In interests, we only want to keep rows where scale_id == 'OI'
interests_final = interests[interests.scale_id == 'OI']
In [17]:
len(interests_final)
Out[17]:
In [18]:
feature(interests_final)
Out[18]:
In [19]:
interests_final['domain'] = 'Interests'
interests_final.head()
Out[19]:
In [20]:
interests_pt = interests_final.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
interests_pt.head()
Out[20]:
In [21]:
#we do not need to do anything to job_zones
job_zones_final = job_zones
In [22]:
len(job_zones_final)
Out[22]:
In [23]:
feature(job_zones_final)
Out[23]:
In [24]:
job_zones_final.head()
Out[24]:
In [25]:
job_zones_final['domain'] = 'Job_Zones'
job_zones_final['element_name'] = 'job_zone'
job_zones_pt = job_zones_final.pivot_table('job_zone',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
job_zones_pt.head()
Out[25]:
In [26]:
#for skills, we only want to keep rows where scale_id == "IM"
skills_final = skills[skills.scale_id == 'IM']
In [27]:
len(skills_final)
Out[27]:
In [28]:
feature(skills_final)
Out[28]:
In [29]:
skills_final.head()
Out[29]:
In [30]:
skills_final['domain'] = 'Skills'
skills_pt = skills_final.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
skills_pt.head()
Out[30]:
In [31]:
#for work activities, we only want to keep rows where scale_id == 'IM'
work_activities_final = work_activities[work_activities.scale_id == 'IM']
In [32]:
len(work_activities_final)
Out[32]:
In [33]:
feature(work_activities_final)
Out[33]:
In [34]:
work_activities_final['domain'] = 'Work_Activities'
work_activities_pt = work_activities_final.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
work_activities_pt.head()
Out[34]:
In [35]:
#in work context, we only want to keep rows where scale_id == 'CX' or 'CT'
work_context_final = work_context[(work_context['scale_id'] == 'CX') | (work_context['scale_id'] == 'CT')]
In [36]:
len(work_context_final)
Out[36]:
In [37]:
feature(work_context_final)
Out[37]:
In [38]:
work_context_final_CX = work_context_final[work_context_final['scale_id'] == 'CX']
work_context_final_CT = work_context_final[work_context_final['scale_id'] == 'CT']
In [39]:
work_context_final_CX['domain'] = 'Work_Context'
work_context_CX_pt = work_context_final_CX.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
work_context_CX_pt.head()
Out[39]:
In [40]:
work_context_final_CT['domain'] = 'Work_Context_Time'
work_context_CT_pt = work_context_final_CT.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
work_context_CT_pt.head()
Out[40]:
In [41]:
#in work styles, we can keep everything
work_styles_final = work_styles
In [42]:
len(work_styles_final)
Out[42]:
In [43]:
feature(work_styles_final)
Out[43]:
In [44]:
work_styles_final['domain'] = 'Work_Styles'
work_styles_pt = work_styles_final.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
work_styles_pt.head()
Out[44]:
In [45]:
#in work values, we want to only keep rows where scale_id == 'EX'
work_values_final = work_values[work_values.scale_id == 'EX']
In [46]:
len(work_values_final)
Out[46]:
In [47]:
feature(work_values_final)
Out[47]:
In [48]:
work_values_final['domain'] = 'Work_Values'
work_values_pt = work_values_final.pivot_table('data_value',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
work_values_pt.head()
Out[48]:
In [49]:
occupation_data['element_name'] = "title"
occupation_data['domain'] = 'Occupation'
occ_data_pt = occupation_data.pivot_table('title',
rows = 'onet_soc_code',
cols = ['domain', 'element_name'],
aggfunc = 'sum')
#combined_df = combined_df.rename(columns=lambda x: x.replace(' ', '_'))
occ_data_pt.Occupation.title = occ_data_pt.Occupation.title.apply(lambda x: x.replace(' ', '_'))
occ_data_pt.Occupation.title = occ_data_pt.Occupation.title.apply(lambda x: x.replace('/', '_'))
occ_data_pt.Occupation.title = occ_data_pt.Occupation.title.apply(lambda x: x.replace(',', '_'))
occ_data_pt.tail()
Out[49]:
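The three replace calls above could also be collapsed into a single regular-expression substitution (an equivalent sketch, not a change to the notebook's approach):
# Sketch: replace spaces, slashes and commas in the titles in one pass
import re
occ_data_pt.Occupation.title = occ_data_pt.Occupation.title.apply(lambda x: re.sub(r'[ /,]', '_', x))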
In [50]:
domain_pt_list = [abilities_pt, knowledge_pt, interests_pt, job_zones_pt, skills_pt, work_activities_pt, work_context_CX_pt, work_context_CT_pt, work_styles_pt, work_values_pt]
combined_df = pd.concat(domain_pt_list, axis=1)
combined_df = pd.merge(occ_data_pt, combined_df, left_index = True, right_index = True)
combined_df.head()
Out[50]:
In [51]:
# Remove spaces in element names
combined_df = combined_df.rename(columns=lambda x: x.replace(' ', '_'))
combined_df.head()
Out[51]:
In [52]:
combined_df.to_csv("onet_data.csv")
We use (x - minimum) / (maximum - minimum) for the normalization, so that every feature lies between 0 and 1.
In [53]:
def normalize(series):
maximum = series.max()
minimum = series.min()
return [(item - minimum) / (maximum - minimum) for item in series]
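One caveat worth noting: if a feature column were constant, maximum - minimum would be zero and the division would fail. A guarded variant (a sketch only; it assumes a constant column should simply map to 0) would be:
# Guarded min-max normalization (sketch; the plain version above is what the notebook actually uses)
def normalize_safe(series):
    maximum = series.max()
    minimum = series.min()
    if maximum == minimum:
        return [0.0 for item in series]
    return [(item - minimum) / (maximum - minimum) for item in series]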
In [54]:
normed_df = combined_df.copy()
normed_df.iloc[:,1:] = normed_df.iloc[:,1:].apply(normalize)
normed_df.head()
Out[54]:
In [55]:
normed_df.to_csv("onet_data_normalized.csv")
In this section, we view histograms of all of the features. Early on, this helped us identify problems with the data set (such as incorrectly filling in NaNs with 0 before the normalization, which skewed the data).
In [56]:
#function to draw histograms for a particular domain
from math import floor,ceil
def draw_histogram(domain_frame):
fig, axes = plt.subplots(nrows=int((ceil(float(len(domain_frame.columns))/3.0))), ncols=3, figsize = (12,len(domain_frame.columns)))
plt.subplots_adjust(hspace = 0.4)
for i,column_name in enumerate(domain_frame.columns):
row = int(floor(i/3))
column = i % 3
domain_frame[column_name].hist(bins=10, ax=axes[row,column]); axes[row,column].set_title(column_name); axes[row,column].set_ylim([0,500])
In [57]:
# draw_histogram(combined_df.Job_Zones)
# normed_df.Interests.Investigative.hist(bins=10)
draw_histogram(normed_df.Interests)
In [58]:
draw_histogram(normed_df.Abilities)
In [59]:
draw_histogram(normed_df.Knowledge)
In [60]:
draw_histogram(normed_df.Skills)
In [61]:
draw_histogram(normed_df.Work_Activities)
In [62]:
draw_histogram(normed_df.Work_Context)
In [63]:
draw_histogram(normed_df.Work_Styles)
In [64]:
draw_histogram(normed_df.Work_Values)
We compute the pairwise correlations between features to evaluate their relationships. In future work, these results could drive feature selection, with the end goal of obtaining better clustering results.
In [65]:
corr_df = normed_df.iloc[:,1:].corr()
In [66]:
corr_df.index = corr_df.index.droplevel(0)
corr_df.head()
Out[66]:
In [67]:
corr_df.columns = corr_df.columns.droplevel(0)
corr_df.head()
Out[67]:
In [68]:
corr_pairs_list = []
for i in range(len(corr_df.index)):
row_name = corr_df.index[i]
for j in range(i + 1, len(corr_df.columns)):
column_name = corr_df.columns[j]
corr_pairs_list.append([row_name,column_name, corr_df.ix[i,j]])
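The nested loop above can also be written as a mask over the upper triangle of the correlation matrix (an equivalent sketch, not used in the cells below):
# Sketch: extract the unique correlation pairs with an upper-triangle mask
mask = np.triu(np.ones(corr_df.shape, dtype=bool), k=1)
corr_pairs_alt = corr_df.where(mask).stack()
# corr_pairs_alt is a Series indexed by (row_name, column_name) holding the same correlations as corr_pairs_list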
In [69]:
corr_pairs_df = DataFrame(corr_pairs_list)
In [70]:
#here are the correlations sorted in ascending order
#not surprisingly, spending time sitting is negatively correlated with features measuring strength and physical activity
corr_pairs_df.sort(2)
Out[70]:
In [71]:
#here are the correlations presented in descending order
corr_pairs_df.sort(2, ascending=False)
Out[71]:
We experimented with this weighting scheme to downweight features in domains that have a large number of features, but ultimately did not use the weighted values, to avoid introducing biases into the data.
In [72]:
# weight each feature by 1 / (number of features in its domain), so that every domain contributes equally
normed_df.columns
weighted_df = normed_df.copy()
domains = ['Abilities','Interests','Job_Zones','Knowledge','Skills','Work_Activities','Work_Context','Work_Context_Time','Work_Styles','Work_Values']
for domain in domains:
domain_frame = weighted_df[domain]
n_cols = len(domain_frame.columns)
weighted_domain_frame = domain_frame/n_cols
weighted_df[domain] = weighted_domain_frame
weighted_df.head()
Out[72]:
In [73]:
#get rid of occupations that have too many NaNs
nan_count = len(normed_df.columns) - normed_df.count(axis=1)
In [74]:
nan_count.unique()
Out[74]:
In [75]:
nan_count.hist()
Out[75]:
In [76]:
len(weighted_df.columns)
Out[76]:
In [77]:
len(nan_count[nan_count==35])
Out[77]:
We ultimately decided to remove any occupation that had any NaNs among its features. Occupations with NaNs were missing almost all of their features, with the exception of one occupation that had 35 NaNs (out of 248 features). We removed them because we felt we didn't have sufficient information to calculate meaningful distance measures for these occupations.
In [78]:
#get rid of occupations that have nans
weighted_df_no_na = weighted_df.dropna(how='any')
normed_df_no_na = normed_df.dropna(how='any')
In [79]:
occ_titles = weighted_df_no_na.Occupation.title
In [80]:
#calculate for weighted features
euclid_dist_array_weighted = sklearn.metrics.pairwise.euclidean_distances(weighted_df_no_na.iloc[:,1:])
euclid_dist_df_weighted = DataFrame(euclid_dist_array_weighted).set_index(occ_titles)
euclid_dist_df_weighted.columns = occ_titles
euclid_dist_df_weighted.head()
Out[80]:
In [81]:
#calculate for normed features
euclid_dist_array_normed = sklearn.metrics.pairwise.euclidean_distances(normed_df_no_na.iloc[:,1:])
euclid_dist_df_normed = DataFrame(euclid_dist_array_normed).set_index(occ_titles)
euclid_dist_df_normed.columns = occ_titles
euclid_dist_df_normed.head()
Out[81]:
In [82]:
max(euclid_dist_df_weighted.max())
Out[82]:
In [83]:
max(euclid_dist_df_normed.max())
Out[83]:
In the commented-out code below, we loop through different parameter values for DBSCAN and save the silhouette coefficient, which we wanted to maximize (values close to 1 indicate well-separated clusters). It takes close to an hour to run. The results were not good: we ran into errors for certain combinations of eps and min_samples values, and all of our silhouette coefficients were negative or very close to zero, indicating a lot of overlap among the clusters.
We concluded that DBSCAN was not the appropriate clustering method, as our data may not meet its assumption of high-density clusters surrounded by low-density regions. Given the high dimensionality of our data set, it is also challenging to visualize the different occupations. Later, we attempt a PCA transformation for visualization, but find that the variance in our data set cannot be explained by two or three principal components.
In [84]:
# #set up
# eps_values = np.arange(2.0,4,0.05)
# min_samples = np.arange(5,20)
In [85]:
# density_results = []
# # See http://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#example-cluster-plot-dbscan-py
# normed_array = np.array(normed_df_no_na.iloc[:,1:])
# from sklearn.cluster import DBSCAN
# from sklearn import metrics
# for eps_value in eps_values:
# for min_sample in min_samples:
# db = DBSCAN(eps=eps_value, min_samples=min_sample).fit(normed_array)
# core_samples = db.core_sample_indices_
# labels = db.labels_
# #Number of clusters in labels
# n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
# coeff = metrics.silhouette_score(normed_array, labels)
# density_results.append((eps_value, min_sample, n_clusters_, coeff))
In [86]:
# density_results
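The errors in the sweep above most likely come from parameter combinations where DBSCAN labels every point as noise (or puts everything in a single cluster); metrics.silhouette_score raises a ValueError when there are fewer than two labels. A guarded version of the sweep (a sketch only, and just as slow as the original, so it is not run here) would skip those cases:
# Sketch: parameter sweep that skips degenerate labelings before computing the silhouette coefficient
from sklearn.cluster import DBSCAN
from sklearn import metrics
eps_values = np.arange(2.0, 4, 0.05)
min_samples = np.arange(5, 20)
normed_array = np.array(normed_df_no_na.iloc[:, 1:])
density_results = []
for eps_value in eps_values:
    for min_sample in min_samples:
        db = DBSCAN(eps=eps_value, min_samples=int(min_sample)).fit(normed_array)
        labels = db.labels_
        if len(set(labels)) < 2:
            continue  # all noise or a single cluster: silhouette_score would raise an error
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        coeff = metrics.silhouette_score(normed_array, labels)
        density_results.append((eps_value, min_sample, n_clusters_, coeff))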
This is an illustrative example of our DBSCAN output for one particular choice of parameters.
In [87]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
normed_array = np.array(normed_df_no_na.iloc[:,1:])
db = DBSCAN(eps=2.1, min_samples=10).fit(normed_array)
core_samples = db.core_sample_indices_
labels = db.labels_
num_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
coeff = metrics.silhouette_score(normed_array, labels)
print 'Number of Clusters: ' + str(num_clusters_)
print 'Noise: ' + str(len(labels[labels==-1])) + ' data points'
print 'Silhouette Coefficient: ' + str(coeff)
In [88]:
num_clusters_
Out[88]:
The PCA output shows that less than 50% of the variance in the data set can be explained by the first two components, which is not encouraging.
In [90]:
from sklearn.decomposition import PCA
import seaborn as sns
from mpld3 import enable_notebook, disable_notebook
from mpld3 import plugins
enable_notebook()
pca = PCA(n_components=2).fit(normed_array)
pca.explained_variance_ratio_
Out[90]:
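To see how quickly the explained variance accumulates beyond two components, one can fit a slightly larger PCA (an illustrative sketch; the choice of 10 components is arbitrary):
# Sketch: cumulative explained variance for the first 10 principal components
pca_10 = PCA(n_components=10).fit(normed_array)
print(np.cumsum(pca_10.explained_variance_ratio_))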
Here, we can visualize the results with an interactive scatterplot, with the caveat that the locations of the points in space are based on the two PCA components detailed above.
In [91]:
pca_2d = pca.transform(normed_array)
fig, ax = plt.subplots(subplot_kw=dict(axisbg='#EEEEEE'))
N = 100
current_palette = sns.color_palette("husl", num_clusters_ + 1)
for i in range(0, pca_2d.shape[0]):
db_index = int(db.labels_[i])
scatter = ax.scatter(pca_2d[i,0], pca_2d[i,1],
c= current_palette[db_index],
alpha=0.6)
labels = [np.array(occ_titles)[i]]
tooltip = plugins.PointLabelTooltip(scatter, labels=labels)
plugins.connect(fig, tooltip)
ax.grid(color='white', linestyle='solid')
ax.set_title("Scatter Plot (with tooltips!)", size=20)
Out[91]:
In the plot above, the purple dots are noise points that could not be assigned to any cluster; these account for more than half of the occupations.
The dark green clusters look to be mechanical-repair fields, and the light green occupations seem similar.
The gold dots largely involve the medical field.
The blue dots are mechanical/manufacturing engineers.
The red dots encompass a wide variety of office jobs.
In [92]:
disable_notebook()
Following DBSCAN, we try hierarchical clustering. The number of clusters can be varied depending on how fine-grained you want the outputs to be; 50 clusters seems to yield useful results.
In [93]:
import time as time
import mpl_toolkits.mplot3d.axes3d as p3
from sklearn.cluster import Ward
from sklearn import metrics
In [94]:
normed_array = np.array(normed_df_no_na.iloc[:,1:])
In [95]:
# Compute clustering
print "Compute structured hierarchical clustering..."
st = time.time()
ward = Ward(n_clusters=50, connectivity=None).fit(normed_array)
label = ward.labels_
print "Elapsed time: ", time.time() - st
print "Number of points: ", label.size
In [96]:
clusters = DataFrame(data = label, index = occ_titles)
clusters.columns = ['cluster']
In [97]:
for cluster in range(clusters.cluster.max()):
current_cluster = clusters[clusters.cluster == cluster]
print '\nOccupations in cluster ' + str(cluster) + ' (' + str(len(current_cluster)) + ' total):'
for occ in current_cluster.index:
print ' ' + occ
In [98]:
#Function to take in a career and return other careers in the same cluster
def getCluster(occ_title):
occ_cluster = clusters.ix[occ_title]
cluster_data = clusters[clusters.cluster == occ_cluster.cluster]
print '\nOccupations that are similar to ' + occ_title + ' are:'
for occ in cluster_data.index:
if occ != occ_title:
print ' ' + occ
# return cluster_data
In [99]:
getCluster('Aerospace_Engineers')
We consider "unique" careers to be those whose distance to their nearest neighboring occupation falls beyond the 95th percentile.
In [100]:
#calculate, for each occupation, the distance to get to the 1st, 2nd, 3rd, 4th, and 5th closest occupations by distance
closest_five_distances = DataFrame()
for occ_title in occ_titles:
distances = DataFrame(euclid_dist_df_normed.xs(occ_title))
distances
distances.sort(occ_title,inplace=True)
low_distances = distances.iloc[1:,:].head(5).T
low_distances.columns = [1,2,3,4,5]
closest_five_distances= pd.concat([closest_five_distances,low_distances])
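The NearestNeighbors class imported at the top of the notebook (but otherwise unused) could compute the same nearest-neighbour distances directly from the feature matrix (a sketch only; the table built above is what the rest of this section uses):
# Sketch: distances to the 5 closest occupations via sklearn's NearestNeighbors (Euclidean by default)
nn = NearestNeighbors(n_neighbors=6).fit(normed_df_no_na.iloc[:, 1:])  # 6 = the occupation itself + 5 neighbours
nn_distances, nn_indices = nn.kneighbors(normed_df_no_na.iloc[:, 1:])
closest_five_alt = DataFrame(nn_distances[:, 1:], index=occ_titles, columns=[1, 2, 3, 4, 5])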
In [101]:
closest_five_stats = closest_five_distances.describe(percentile_width=90)
In [102]:
def get_unique_careers(distance_threshold = closest_five_stats.ix['95%'][1]):
unique_occs = []
for occ_title in occ_titles:
distances = euclid_dist_df_normed.xs(occ_title)
nearby_occs = len(distances[distances <= distance_threshold])-1
if nearby_occs == 0:
unique_occs.append((occ_title, nearby_occs))
unique_careers = DataFrame(unique_occs)
print '\nThe most unique occupations are: '
for occ in unique_careers[0]:
print ' ' + occ
In [103]:
get_unique_careers()
In [104]:
#function used elsewhere to print out all of the domains and features in an occupation data frame
def print_domains_features(occ_frame):
for domain in occ_frame.index.get_level_values('domain').unique():
sub_frame = occ_frame.xs(domain, level=0)
print ' ' + domain.replace("_", " ")
for feature in sub_frame.index:
print ' ' + feature.replace("_", " ")
In [105]:
print 'Here is a list of all of the possible occupations:'
set(occ_titles)
Out[105]:
In [106]:
#function to find out what are the most important features for a particular career
#this picks all features that are greater than 0.8, or the top 20 if there are more than 20
#we should remove the job_zone and Work_Context_Time domains, because 1 doesn't mean importance for those categories
def important_features(occ_title):
selected_occ_series = normed_df_no_na[normed_df_no_na.Occupation.title == occ_title].T.drop('Occupation',level=0)
selected_occ_series.columns = [occ_title]
selected_occ_series.drop(['Job_Zones','Work_Context_Time'], level=0, inplace=True)
important_features = selected_occ_series[selected_occ_series[occ_title] >= 0.8]
if len(important_features) > 20:
important_features = selected_occ_series.sort(occ_title, ascending=False).head(20)
elif len(important_features) < 10:
important_features = selected_occ_series.sort(occ_title, ascending=False).head(10)
important_features = important_features.sort(axis=0)
print 'The most important attributes for ' + occ_title.replace('_', ' ') + ' are:'
print_domains_features(important_features)
return important_features
important = important_features('Business_Intelligence_Analysts')
In [107]:
#function to find out what are the least important features for a particular career
#this picks all features that are equal to 0
def irrelevant_features(occ_title):
selected_occ_series = normed_df_no_na[normed_df_no_na.Occupation.title == occ_title].T.drop('Occupation',level=0)
selected_occ_series.columns = [occ_title]
selected_occ_series.drop(['Job_Zones','Work_Context_Time'], level=0, inplace=True)
irrelevant_features = selected_occ_series[selected_occ_series[occ_title] == 0]
print 'The least important attributes for ' + occ_title + ' are:'
for domain in irrelevant_features.index.get_level_values('domain').unique():
sub_irrelevant_features = irrelevant_features.xs(domain, level=0)
print ' ' + domain
for feature in sub_irrelevant_features.index:
print ' ' + feature
print ''
return irrelevant_features
irrelevant = irrelevant_features('Business_Intelligence_Analysts')
In [108]:
#function to find 20 related careers to an input career
def closest_occs(occ_title, similar=20):
closest_df = DataFrame()
distance_series = euclid_dist_df_normed.xs(occ_title).order().head(similar)
selected_occ_series = normed_df_no_na[normed_df_no_na.Occupation.title == occ_title].T.drop('Occupation',level=0)
selected_occ_series.columns = [occ_title]
closest_occs_frame = selected_occ_series
for comparison in distance_series.index[1:]:
compare_occ_series = normed_df_no_na[normed_df_no_na.Occupation.title == comparison].T.drop('Occupation',level=0)
closest_occs_frame[comparison] = compare_occ_series.iloc[:,0]
# closest_feature_frame['difference'] = abs(closest_feature_frame[occ_title] - closest_feature_frame[comparison])
print "The closest occupations to " + occ_title.replace("_", " ") + " are:"
for x in closest_occs_frame.columns[1:]:
print ' ' +x.replace("_", " ")
print '\nSee the table below to see how similar the careers are.'
return closest_occs_frame.sort(occ_title, ascending=False)
closest_occs = closest_occs('Business_Intelligence_Analysts')
closest_occs
Out[108]:
In [109]:
#function to compare two occupations
def compare_occs(occ1, occ2):
occ1_series = normed_df_no_na[normed_df_no_na.Occupation.title == occ1].T.drop('Occupation',level=0)
occ1_series.columns = [occ1]
compare_frame = occ1_series
occ2_series = normed_df_no_na[normed_df_no_na.Occupation.title == occ2].T.drop('Occupation',level=0)
compare_frame[occ2] = occ2_series.iloc[:,0]
compare_frame.drop(['Job_Zones','Work_Context_Time'], level=0, inplace=True)
compare_frame['difference'] = compare_frame[occ1] - compare_frame[occ2]
compare_frame['abs_difference'] = abs(compare_frame['difference'])
#get the features with less than 0.1 distance between them that are above 0.8 for at least one of the occs
shared_high = compare_frame[((compare_frame[occ1] >= 0.8) | (compare_frame[occ2] >= 0.8)) & (compare_frame.abs_difference < 0.1)]
print '\nBoth ' + occ1.replace("_", " ") + ' and ' + occ2.replace("_", " ") + ' require similarly high degrees of:'
if len(shared_high) == 0:
print ' Nothing in common!'
else:
print_domains_features(shared_high)
#get the features that are more important for occ1 than occ2
more_important = compare_frame.sort('difference', ascending=False).head(10)
print '\n' + occ1.replace("_", " ") + ' require higher degrees of the following attributes than ' + occ2.replace("_", " ") + ':'
print_domains_features(more_important)
#get the features that are more important for occ2 than occ1
less_important = compare_frame.sort('difference', ascending=True).head(10)
print '\n' + occ2.replace("_", " ") + ' require higher degrees of the following attributes than ' + occ1.replace("_", " ") + ':'
print_domains_features(less_important)
#get the feature neither occ needs
not_needed = compare_frame[(compare_frame[occ1] ==0) & (compare_frame[occ2] == 0)]
print '\nNeither ' + occ1.replace("_"," ") + ' nor ' + occ2.replace("_"," ") + ' require:'
print_domains_features(not_needed)
print ''
# return more_important
compare_occs('Business_Intelligence_Analysts','Information_Technology_Project_Managers')