In [ ]:
# keep track of titles which cannot be parsed
invalid_titles = []  # module-level, so a 'global' declaration is unnecessary here

In [ ]:
from parse_funs import *  # provides parse() and, presumably, parse_url/user/pwd used below

In [ ]:
import time

import numpy as np
import pandas as pd
import requests

import my_util  # local utility module (provides tagChanges, used below)

# needed dirs
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
  • Data loading:

In [ ]:
df = pd.read_csv(DATA_DIR + 'doc_index_filter.csv')

In [ ]:
titles = list(df['title'].unique())
n_title = len(titles)
print('# unique titles: %d' % n_title)

In [ ]:
title_stats = pd.read_csv(DATA_DIR + 'stats_job_titles.csv')

Parsing job titles

We divide the parsing into several medium-sized batches, which is good practice when calling a web service or a remote server. Batching also makes it much easier to locate a bug when one occurs.
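
Because each parse() call goes over the network, a small retry wrapper can make the batches more robust. Below is a minimal sketch; it assumes parse() (from parse_funs) raises requests.exceptions.RequestException on transient failures, which is an assumption about its error behavior:


In [ ]:
def parseWithRetry(t, n_retries=3, wait=5):
    '''Retry parse(t) a few times before giving up on the title.'''
    for attempt in range(n_retries):
        try:
            return parse(t)
        except requests.exceptions.RequestException:  # assumed failure mode of parse()
            print('retry %d for title: %s' % (attempt + 1, t))
            time.sleep(wait)
    invalid_titles.append(t)  # record the title we gave up on
    return None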


In [ ]:
def parseBatch(b, start=None, end=None):
    '''
    @brief: parse batch b of 100 titles (or an explicit [start, end) slice)
    @return: DF containing results of parsing the titles in batch b
    '''
    print('Parsing batch {}...'.format(b))
    if start is None and end is None:  # 'not start' would misfire when start == 0
        start = 100*b; end = start + 100
    batch_titles = titles[start:end]
    frames = [parse(t) for t in batch_titles]
    res = pd.concat(frames)
    res.reset_index(drop=True, inplace=True)
    time.sleep(3)  # be gentle with the web service between batches

    # defensive save of the batch result
    fname = DATA_DIR + 'tmp/b{}.csv'.format(b)
    res.to_csv(fname, index=False)
    print('\t saved result to tmp file')
    return res

In [ ]:
n_batch = n_title // 100; remainder = n_title % 100  # integer division, so range(n_batch) also works in Python 3

In [ ]:
frames = [parseBatch(b) for b in range(n_batch)]

if remainder:  # guard: titles[-0:] would return the whole list
    rem_titles = titles[-remainder:]
    frames.append(pd.concat([parse(t) for t in rem_titles]))

res = pd.concat(frames)
print(res.shape)
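
By definition of integer division and modulo, the batches plus the remainder must cover every title; a cheap sanity check:


In [ ]:
assert n_batch * 100 + remainder == n_title  # all titles accounted for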
  • Check for duplicates in results:

In [ ]:
print('# dups in parsing result: %d' %sum(res.duplicated('title')))
res[res.duplicated('title')]
  • Save invalid titles:

In [ ]:
invalid_titles = res.title[res.duplicated('title')].unique()
print('# invalid titles: %d' %len(invalid_titles))
pd.DataFrame({'title': invalid_titles}).to_csv(DATA_DIR + 'invalid_titles.csv', index=False)
  • Remove duplicates due to invalid titles and replace NAs with empty strings (so that NAs do not corrupt the standardized titles later):

In [ ]:
res = res.drop_duplicates('title')
print(res.shape)

# replace NAs in the four parsed components with empty strings
res.fillna({'domain': '', 'position': '', 'pri_func': '', 'sec_func': ''}, inplace=True)

In [ ]:
res.to_csv(DATA_DIR + 'parsed_titles.csv', index=False)

Standardize Job Titles

The standard form for all titles is: position + domain + (secondary function) + primary function.


In [ ]:
def camelCase(s):
    '''Capitalize each word via str.title(); note it also capitalizes
    after hyphens and apostrophes (e.g. "pre-school" -> "Pre-School").'''
    return s.title()

def joinValidParts(row):
    '''
    @brief: join only the non-empty parts of a parsed title,
    dropping parts that are empty strings
    '''
    parts = [row.position, row.domain, row.sec_func, row.pri_func]
    valid_parts = [p for p in parts if p != '']
    return ' '.join(valid_parts)
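
A quick check of the two helpers on a hypothetical parsed row (the namedtuple and its field values are made up for illustration; any object with the four attributes works):


In [ ]:
from collections import namedtuple

FakeRow = namedtuple('FakeRow', ['position', 'domain', 'sec_func', 'pri_func'])
r = FakeRow(position='senior', domain='software', sec_func='', pri_func='engineer')
print(joinValidParts(r))             # senior software engineer
print(camelCase(joinValidParts(r)))  # Senior Software Engineer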

In [ ]:
# This naive join will cause confusing results
res['std_title'] = res.position + ' ' + res.domain + ' ' + res.sec_func + ' ' + res.pri_func

The following helps expose the problem: non_std_title looks identical to title to the naked eye, yet the comparison flags a difference.


In [ ]:
row = res.query('non_std_title != std_title').iloc[0]
my_util.tagChanges(row.non_std_title, row.title)

The diff shows that the problem comes from NA parts (e.g. sec_func): when joined naively, they add extra spaces, almost invisible to the naked eye, to the standardized version. We therefore need a better join that combines only non-empty parts, which is what joinValidParts() above does.
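
A minimal illustration of the effect, showing that empty parts keep their separators in a plain join:


In [ ]:
' '.join(['site', '', '', 'engineer'])  # 'site   engineer' -- extra spaces from the empty parts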


In [ ]:
row = res.query('non_std_title == "Site Engineer"').iloc[0]
print(joinValidParts(row))

' '.join([row.position, row.domain, row.sec_func, row.pri_func])

In [ ]:
res['std_title'] = res.apply(joinValidParts, axis=1)
res.std_title = res.std_title.apply(camelCase)

From now on, title means standard title.


In [ ]:
res.rename(columns={'std_title': 'title'}, inplace=True)
res.to_csv(DATA_DIR + 'parsed_titles.csv', index=False)

In [ ]:
stdForm = dict(zip(res.non_std_title, res.title))

print('# non-std titles which can be standardized: %d' % len(stdForm))

uniq_std_titles = np.unique(list(stdForm.values()))  # list() needed in Python 3
print('# unique standard titles: %d' % len(uniq_std_titles))

Standardizing Job Titles of Posts


In [ ]:
def toStd(t):
    '''Map a title to its standard form; fall back to the original title
    if it was never parsed or if its parse result is empty.'''
    std = stdForm.get(t, '')
    return std if std != '' else t

In [ ]:
df = pd.read_csv(DATA_DIR + 'doc_index_filter.csv')
df.columns

In [ ]:
uniq_non_std_titles = df['non_std_title'].unique()
# titles in the data that the parser never saw
tmp = [t for t in uniq_non_std_titles if t not in stdForm]
tmp[:5]

In [ ]:
del df['title']

In [ ]:
print("# unique non-std titles in data: %d" % df.non_std_title.nunique())

In [ ]:
df['title'] = df['non_std_title'].apply(toStd)  # map() returns an iterator in Python 3
  • Sanity check that the standardization works:

In [ ]:
df.query('non_std_title != title')[['non_std_title', 'title']].head()
any(df.title == '')

In [ ]:
df.to_csv(DATA_DIR + 'doc_index_filter.csv', index=False)
  • Re-query job titles with at least 2 posts:

In [ ]:
by_title_agg = df.groupby('title').agg({'job_id': 'nunique'})
by_title_agg.rename(columns={'job_id': 'n_post'}, inplace=True)
by_title_agg.reset_index(inplace=True)
by_title_agg.head()

In [ ]:
titles2_ = by_title_agg.query('n_post >= 2')
print('# titles with >= 2 posts: %d' %titles2_.shape[0])

In [ ]:
title_df = pd.merge(titles2_, res)
print(title_df.shape[0])

In [ ]:
# res.columns
title_df.columns

In [ ]:
title_df.to_csv(DATA_DIR + 'new_titles_2posts_up.csv', index=False)

Domains in Job Titles


In [ ]:
by_domain = res.groupby('domain')
print('# domains: {}'.format(by_domain.ngroups))

by_domain_agg = by_domain.agg({'title': len})
by_domain_agg = by_domain_agg.add_prefix('n_').reset_index()
by_domain_agg.sort_values('n_title', ascending=False, inplace=True)

In [ ]:
by_domain_agg.describe().round(1)

Although the total number of domains is large, we are only interested in domains with at least $2$ job titles: a domain with only $1$ title gives us no pairwise similarity score.
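
To see why, a domain with $n$ titles yields $n(n-1)/2$ title pairs to score, which is $0$ when $n = 1$:


In [ ]:
# pair counts for domains of 1, 2 and 5 titles
[n * (n - 1) // 2 for n in (1, 2, 5)]  # [0, 1, 10]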


In [ ]:
domains_2 = by_domain_agg.query('n_title >= 2')
print('# domains with at least 2 job titles: %d' %len(domains_2))

In [ ]:
by_domain_agg.to_csv(DATA_DIR + 'stats_domains.csv', index=False)

Primary Functions in Job Titles


In [ ]:
by_pri_func = res.groupby('pri_func')
print('# primary functions: {}'.format(by_pri_func.ngroups))

by_pri_func_agg = by_pri_func.agg({'title': len})
by_pri_func_agg = by_pri_func_agg.add_prefix('n_').reset_index()
by_pri_func_agg.sort_values('n_title', ascending=False, inplace=True)

by_pri_func_agg.to_csv(DATA_DIR + 'stats_pri_funcs.csv', index=False)

In [ ]:
by_pri_func_agg.describe().round(1)

In [ ]:
by_pri_func_agg.head(10)

Test parser web service (source by Koon Han)


In [ ]:
# r0 = requests.post(parse_url, auth=(user, pwd), 
#                   json={"job_title":"accountant", "verbose": False})

# print(r0.status_code)
# print(r0.json()['output'])

r1 = requests.post(parse_url, auth=(user, pwd), 
                  json={"job_title":"software developer", "verbose": False})

print(r1.status_code)
print(r1.json()['output'])


# r2 = requests.post(parse_url, auth=(user, pwd), 
#                   json={"job_title":"pre-school teacher", "verbose": False})

# print(r2.status_code)
# print(r2.json()['output'])

# r3 = requests.post(parse_url, auth=(user, pwd), 
#                   json={"job_title":"Assistant Civil and Structural Engineer", 
#                         "verbose": False})

In [ ]:
r4 = requests.post(parse_url, auth=(user, pwd), 
                  json={"job_title": "Security Analyst, Information Technology",
                        "verbose": False})
print(r4.json()['output'])

r5 = requests.post(parse_url, auth=(user, pwd), 
                  json={"job_title": "Data analyst and scientist",
                        "verbose": False})
print(r5.json()['output'])
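
A more defensive variant (a sketch, using the same endpoint and credentials as above) that checks the HTTP status before reading the payload:


In [ ]:
r = requests.post(parse_url, auth=(user, pwd),
                  json={"job_title": "data scientist", "verbose": False})
if r.status_code == 200:
    print(r.json()['output'])
else:
    print('parser service returned HTTP %d' % r.status_code)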