In [ ]:
# keep track of titles which cannot be parsed
invalid_titles = []
In [ ]:
import time

import numpy as np
import pandas as pd
import requests

import my_util
from parse_funs import *  # assumed to provide parse(), parse_url, user and pwd
In [ ]:
# needed dirs
HOME_DIR = 'd:/larc_projects/job_analytics/'
DATA_DIR = HOME_DIR + 'data/clean/'
In [ ]:
df = pd.read_csv(DATA_DIR + 'doc_index_filter.csv')
In [ ]:
titles = list(df['title'].unique())
n_title = len(titles)
print('# unique titles: %d' % n_title)
In [ ]:
title_stats = pd.read_csv(DATA_DIR + 'stats_job_titles.csv')
We divide the parsing into medium-sized batches: this is good practice when calling a web service or a remote server, and it also makes it easier to locate a bug when one occurs.
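Since each call to the parser goes over the network, transient failures are possible. Below is a minimal sketch of a retry wrapper, assuming parse() from parse_funs is the call that can fail; the name safe_parse and its retry parameters are illustrative, not part of the original pipeline.
In [ ]:
# A sketch only: retry a single parse() call a few times before giving up.
# safe_parse, n_retries and wait are hypothetical, not part of parse_funs.
import time

def safe_parse(title, n_retries=3, wait=5):
    for attempt in range(1, n_retries + 1):
        try:
            return parse(title)
        except Exception:  # the exact error type raised by parse() is not documented here
            print('Attempt {}/{} failed for title: {}'.format(attempt, n_retries, title))
            time.sleep(wait)
    invalid_titles.append(title)  # remember titles that could not be parsed
    return pd.DataFrame()
If transient failures turn out to be common, parse(t) inside parseBatch() below could be swapped for safe_parse(t).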
In [ ]:
def parseBatch(b, start=None, end=None):
    '''
    @brief: parse a batch of 100 titles
    @return: DF containing results of parsing the 100 titles in batch b
    '''
    print('Parsing batch {}...'.format(b))
    if (start is None) and (end is None):
        start = 100*b
        end = start + 100
    batch_titles = titles[start:end]
    frames = [parse(t) for t in batch_titles]
    res = pd.concat(frames)
    res.reset_index(drop=True, inplace=True)
    time.sleep(3)
    # defensive save: keep the result of this batch in a tmp file
    fname = DATA_DIR + 'tmp/b{}.csv'.format(b)
    res.to_csv(fname, index=False)
    print('\t saved result to tmp file')
    return res
In [ ]:
n_batch = n_title // 100; remainder = n_title % 100
In [ ]:
frames = [parseBatch(b) for b in range(n_batch)]
if remainder > 0:  # parse the leftover titles (fewer than 100) in a final batch
    rem_titles = titles[-remainder:]
    last_frame = pd.concat([parse(t) for t in rem_titles])
    frames.append(last_frame)
res = pd.concat(frames)
print(res.shape)
In [ ]:
print('# dups in parsing result: %d' %sum(res.duplicated('title')))
res[res.duplicated('title')]
In [ ]:
invalid_titles = res.title[res.duplicated('title')].unique()
print('# invalid titles: %d' %len(invalid_titles))
pd.DataFrame({'title': invalid_titles}).to_csv(DATA_DIR + 'invalid_titles.csv', index=False)
In [ ]:
res = res.drop_duplicates('title')
print(res.shape)
for col in ['domain', 'position', 'pri_func', 'sec_func']:
    res[col] = res[col].fillna('')
In [ ]:
res.to_csv(DATA_DIR + 'parsed_titles.csv', index=False)
In [ ]:
def camelCase(s):
    return s.title()

def joinValidParts(row):
    '''
    @brief: join only the parts that contain data; parts which are empty strings are dropped
    '''
    parts = [row.position, row.domain, row.sec_func, row.pri_func]
    valid_parts = [p for p in parts if p != '']
    # print('# valid parts: %d' % len(valid_parts))
    return ' '.join(valid_parts)
In [ ]:
# This naive join produces confusing results when some parts are empty strings
res['std_title'] = res.position + ' ' + res.domain + ' ' + res.sec_func + ' ' + res.pri_func
The following check exposes the problem: non_std_title appears identical to the standardized title to the naked eye, yet the two strings do not compare as equal.
In [ ]:
row = res.query('non_std_title != std_title').iloc[0]
my_util.tagChanges(row.non_std_title, row.title)
The output shows that the problem is caused by NA parts (e.g. sec_func), which introduce extra spaces, almost invisible to the naked eye, into the standardized version. We therefore need a better join that combines only the valid parts; this is what joinValidParts() does.
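Since my_util.tagChanges is not included in this notebook, here is a minimal stand-in built on Python's difflib; the name tag_changes and the [+...+]/[-...-] markers are assumptions, not the original helper. The made-up example at the end shows how empty parts sneak extra, nearly invisible spaces into the naive join.
In [ ]:
# Hypothetical stand-in for my_util.tagChanges (not the original helper):
# mark character-level insertions and deletions between two strings.
import difflib

def tag_changes(a, b):
    out = []
    for op, i1, i2, j1, j2 in difflib.SequenceMatcher(None, a, b).get_opcodes():
        if op == 'equal':
            out.append(b[j1:j2])
        elif op == 'insert':
            out.append('[+' + b[j1:j2] + '+]')
        elif op == 'delete':
            out.append('[-' + a[i1:i2] + '-]')
        else:  # 'replace'
            out.append('[-' + a[i1:i2] + '-][+' + b[j1:j2] + '+]')
    return ''.join(out)

# Made-up parts: the empty position and sec_func leave stray spaces in the naive join.
position, domain, sec_func, pri_func = '', 'site', '', 'engineer'
naive = position + ' ' + domain + ' ' + sec_func + ' ' + pri_func
print(repr(naive))                          # ' site  engineer'
print(tag_changes(naive, 'site engineer'))  # the deleted spaces are marked with [- -]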
In [ ]:
row = res.query('non_std_title == "Site Engineer"').iloc[0]
print(joinValidParts(row))
' '.join([row.position, row.domain, row.sec_func, row.pri_func])
In [ ]:
res['std_title'] = res.apply(joinValidParts, axis=1)
res.std_title = res.std_title.apply(camelCase)
In [ ]:
res.rename(columns={'std_title': 'title'}, inplace=True)
res.to_csv(DATA_DIR + 'parsed_titles.csv', index=False)
In [ ]:
stdForm = dict(zip(res.non_std_title, res.title))
print('# non-std titles which can be standardized: %d' % len(stdForm))
uniq_std_titles = np.unique(list(stdForm.values()))
print('# unique standard titles: %d' % len(uniq_std_titles))
In [ ]:
def toStd(t):
    '''Map a title to its standard form; keep it unchanged if no usable standard form exists.'''
    if t not in stdForm:
        return t
    if stdForm[t] == '':  # titles which could not be parsed
        return t
    return stdForm[t]
In [ ]:
df = pd.read_csv(DATA_DIR + 'doc_index_filter.csv')
df.columns
In [ ]:
uniq_non_std_titles = df['non_std_title'].unique()
# titles in the data that never received a standard form
tmp = [t for t in uniq_non_std_titles if t not in stdForm]
tmp[:5]
In [ ]:
del df['title']
In [ ]:
print("# unique non-std titles in data: %d" % df.non_std_title.nunique())
In [ ]:
df['title'] = df['non_std_title'].map(toStd)
In [ ]:
df.query('non_std_title != title')[['non_std_title', 'title']].head()
any(df.title == '')
In [ ]:
df.to_csv(DATA_DIR + 'doc_index_filter.csv', index=False)
In [ ]:
by_title_agg = df.groupby('title').agg({'job_id': 'nunique'})
by_title_agg.rename(columns={'job_id': 'n_post'}, inplace=True)
by_title_agg.reset_index(inplace=True)
by_title_agg.head()
In [ ]:
titles2_ = by_title_agg.query('n_post >= 2')
print('# titles with >= 2 posts: %d' %titles2_.shape[0])
In [ ]:
title_df = pd.merge(titles2_, res)
print(title_df.shape[0])
In [ ]:
# res.columns
title_df.columns
In [ ]:
title_df.to_csv(DATA_DIR + 'new_titles_2posts_up.csv', index=False)
In [ ]:
by_domain = res.groupby('domain')
print('# domains: {}'.format(by_domain.ngroups))
by_domain_agg = by_domain.agg({'title': len})
by_domain_agg = by_domain_agg.add_prefix('n_').reset_index()
by_domain_agg.sort_values('n_title', ascending=False, inplace=True)
In [ ]:
by_domain_agg.describe().round(1)
Though the total number of domains is large, we are really only interested in domains with at least $2$ job titles, because a domain with only $1$ title yields no pairwise similarity score.
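To make this concrete, a domain with $n$ titles yields $n(n-1)/2$ title pairs, which is $0$ when $n = 1$. The quick check below (reusing by_domain_agg from above; n_pairs is just an illustrative local variable) counts the domains that would contribute no pair.
In [ ]:
# Each domain with n titles yields n*(n-1)/2 title pairs; 1-title domains yield none.
n_pairs = by_domain_agg.n_title * (by_domain_agg.n_title - 1) // 2
print('# domains contributing no title pair: %d' % (n_pairs == 0).sum())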
In [ ]:
domains_2 = by_domain_agg.query('n_title >= 2')
print('# domains with at least 2 job titles: %d' %len(domains_2))
In [ ]:
by_domain_agg.to_csv(DATA_DIR + 'stats_domains.csv', index=False)
In [ ]:
by_pri_func = res.groupby('pri_func')
print('# primary functions: {}'.format(by_pri_func.ngroups))
by_pri_func_agg = by_pri_func.agg({'title': len})
by_pri_func_agg = by_pri_func_agg.add_prefix('n_').reset_index()
by_pri_func_agg.sort_values('n_title', ascending=False, inplace=True)
by_pri_func_agg.to_csv(DATA_DIR + 'stats_pri_funcs.csv', index=False)
In [ ]:
by_pri_func_agg.describe().round(1)
In [ ]:
by_pri_func_agg.head(10)
In [ ]:
# r0 = requests.post(parse_url, auth=(user, pwd),
# json={"job_title":"accountant", "verbose": False})
# print r0.status_code
# print r0.json()['output']
r1 = requests.post(parse_url, auth=(user, pwd),
json={"job_title":"software developer", "verbose": False})
print(r1.status_code)
print(r1.json()['output'])
# r2 = requests.post(parse_url, auth=(user, pwd),
# json={"job_title":"pre-school teacher", "verbose": False})
# print r2.status_code
# print r2.json()['output']
# r3 = requests.post(parse_url, auth=(user, pwd),
# json={"job_title":"Assistant Civil and Structural Engineer",
# "verbose": False})
In [ ]:
r4 = requests.post(parse_url, auth=(user, pwd),
json={"job_title": "Security Analyst, Information Technology",
"verbose": False})
print(r4.json()['output'])
r5 = requests.post(parse_url, auth=(user, pwd),
json={"job_title": "Data analyst and scientist",
"verbose": False})
print(r5.json()['output'])