In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import datetime
import html2text
import re
from nltk.corpus import stopwords
In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
%matplotlib inline
In [3]:
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.offline.offline import _plot_html
import cufflinks as cf
init_notebook_mode(connected=True)
cf.go_offline()
In [4]:
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
In [6]:
def clean_txt(text):
    """Normalise a piece of (Arabic/English) text for matching.

    Steps, in order:
      1. Fold the letter variants listed in ``search`` onto the matching
         entry of ``replace``.
      2. Treat every run of characters outside the allowed set (digits,
         ASCII letters, the listed Arabic letters and ``,``) as one separator.
      3. Join the remaining pieces with single spaces, strip, lower-case.

    Parameters
    ----------
    text : str
        Raw text. Must be a string (``str.replace`` is called on it).

    Returns
    -------
    str
        The cleaned, lower-cased text; ``""`` when nothing survives.
    """
    search = ['أ','آ','إ','ة','ى']
    replace = ['ا','ا','ا','ه','ي']
    for variant, canonical in zip(search, replace):
        text = text.replace(variant, canonical)
    # The quantifier must be `+`, not `*`: a pattern that can match the empty
    # string makes re.split cut between every character on Python 3.7+,
    # turning "cairo" into "c a i r o".
    # NOTE(review): the class lists 'ز' twice and has no 'ذ' — looks like a
    # typo in the alphabet; confirm before changing the accepted letters.
    words = re.split("[^0-9a-zA-Zابتثجحخدزرزسشصضطظعغفقكلومنهةىؤئء,يأآئ]+", text)
    return " ".join(words).strip().lower()
def filter_exp_years(text):
    """Parse a free-text experience requirement into ``(low, high, unit)``.

    ``unit`` is ``'month'`` when the text mentions "month"/"months",
    otherwise ``'year'``. With exactly one number in the text the result is
    ``(n, n, unit)`` if the text says more/above/minimum (and not
    less/maximum), else ``(0, n, unit)``. With several numbers it is
    ``(min, max, unit)``; with none, ``(0, 0, unit)``.
    """
    # Pad range-like separators so the tokenizer splits "3-5" into 3, -, 5.
    for separator in ("-", "+", "/", "~"):
        text = text.replace(separator, ' - ')
    tokens = word_tokenize(text)

    numbers = []
    words = []
    for token in tokens:
        if token.isdigit():
            numbers.append(int(token))
        else:
            words.append(token.lower().strip())

    per = 'month' if ('month' in words or 'months' in words) else 'year'

    if not numbers:
        return (0, 0, per)
    if len(numbers) > 1:
        return (min(numbers), max(numbers), per)

    # Exactly one number: decide whether it is a lower or an upper bound.
    only = max(numbers)
    says_upper = 'less' in words or 'maximum' in words
    says_lower = 'more' in words or 'above' in words or 'minimum' in words
    if says_lower and not says_upper:
        return (only, only, per)
    return (0, only, per)
In [5]:
# Load the job-postings CSV into `df`; the path is relative to the notebook's
# working directory.
df = pd.read_csv('data_science_dataset_wuzzuf.csv')
In [18]:
# Work on a copy of the raw frame and pre-create the derived columns with
# neutral defaults; they are filled in by the cleaning step.
df_2 = df.copy()
derived_defaults = [
    ('salary_avg', 0),
    ('clean_description', ''),
    ('clean_job_requirements', ''),
    ('date_year', ''),
    ('date_month', ''),
    ('experience_years_type', 'year'),
    ('experience_years_min', 0),
    ('experience_years_max', 0),
]
for column_name, default_value in derived_defaults:
    df_2[column_name] = pd.Series(default_value, index=df.index)
In [19]:
# Clean the copied frame column by column.
# (The original row loop relied on DataFrame.set_value, which was deprecated
# in pandas 0.21 and removed in 1.0.)

# Clamp non-positive salaries and vacancy counts to 0. The comparison is on
# the integer-truncated value, as in the original per-row `int(...) <= 0`.
for numeric_col in ('salary_min', 'salary_max', 'num_vacancies'):
    df_2.loc[df_2[numeric_col].astype(int) <= 0, numeric_col] = 0
df_2['salary_avg'] = (df_2['salary_max'] + df_2['salary_min']) / 2.0

# Parse the free-text experience requirement into (min, max, unit).
exp_parsed = [filter_exp_years(str(raw)) for raw in df_2['experience_years']]
df_2['experience_years_min'] = [parsed[0] for parsed in exp_parsed]
df_2['experience_years_max'] = [parsed[1] for parsed in exp_parsed]
df_2['experience_years_type'] = [parsed[2] for parsed in exp_parsed]

# post_date is split on '-': first piece is the year, second the month number,
# which is turned into a month name via strftime('%B') (locale-dependent).
date_parts = df_2['post_date'].str.split('-')
df_2['date_year'] = date_parts.str[0]
df_2['date_month'] = [
    datetime.date(1900, int(month_number), 1).strftime('%B')
    for month_number in date_parts.str[1]
]

# Strip HTML markup from the long text fields (missing values become 'nan',
# exactly as str() did in the row loop).
df_2['clean_description'] = df_2['description'].astype(str).map(html2text.html2text)
df_2['clean_job_requirements'] = df_2['job_requirements'].astype(str).map(html2text.html2text)

# Normalise city names in place.
df_2['city_name'] = df_2['city_name'].map(clean_txt)
In [20]:
# Load the cities lexicon: one comma-separated line per city, each cleaned
# with clean_txt and split into its spellings. Downstream code uses the first
# entry of each line as the canonical name.
# NOTE(review): no encoding is passed to open(); the file presumably holds
# Arabic text, so confirm the platform default decodes it correctly.
c_lex = []
with open('cities.csv') as lexicon_file:  # context manager closes the handle
    for line in lexicon_file:
        cleaned = clean_txt(line)
        if not cleaned:
            # A blank line would add [''] and could match an emptied token.
            continue
        c_lex.append(cleaned.split(','))
In [21]:
# Normalise city names against the lexicon: whenever a token of the city name
# (with or without the 'ال' prefix) appears in a lexicon entry, the row's
# city_name becomes that entry's first spelling. There is no early exit, so
# the last matching entry wins and every match is counted.
city_jobs_counts = {}
for i in range(df_2.shape[0]):
    # Progress marker every 100 rows. The original `i+1%100 == 0` parsed as
    # `i + (1 % 100) == 0` and therefore never printed.
    if (i + 1) % 100 == 0:
        print(i)
    city_words = word_tokenize(df_2.at[i, 'city_name'].lower())
    for c in c_lex:
        for w in city_words:
            if w in c or w.replace('ال', '') in c:
                # .at replaces DataFrame.set_value (removed in pandas 1.0).
                df_2.at[i, 'city_name'] = c[0]
                # First match counts as 1 (the original started at 0).
                city_jobs_counts[c[0]] = city_jobs_counts.get(c[0], 0) + 1

# Most frequent cities first. value_counts() already sorts descending; the
# original in-place sort acted on a temporary and had no effect.
df_2['city_name'].value_counts().head(10)
In [23]:
# Display the row at position 208 (hard-coded manual spot-check).
df_2.iloc[208]
Out[23]:
In [24]:
# Clean the displayed job titles with clean_txt. A single column-wise map
# replaces the row loop built on DataFrame.set_value (removed in pandas 1.0).
df_2['displayed_job_title'] = df_2['displayed_job_title'].map(clean_txt)
In [25]:
# Bar chart of the (up to) 7 cities with the most postings.
# value_counts() is computed once instead of 14 times, and .head(7) avoids
# both integer lookups on a string-labelled Series and an IndexError when
# fewer than 7 distinct cities exist.
top_cities = df_2['city_name'].value_counts().head(7)
dt = top_cities.index.tolist()
counts = top_cities.tolist()
plt.figure(figsize=(12,8))
sns.barplot(x=dt,y=counts)
Out[25]:
In [29]:
# Distribution of views per posting, by month, split by year.
# `estimator=sum` was dropped: a violin plot draws the whole distribution and
# has no estimator parameter; current seaborn rejects the stray keyword.
fig, ax = plt.subplots(figsize=(16, 18))
sns.violinplot(
    x="date_month", y="views", hue='date_year',
    data=df_2, palette='rainbow', ax=ax,
)
ax.set(xlabel='Month', ylabel='Views')
plt.show()
In [30]:
# Total views per month, one bar per year
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=df_2, x="date_month", y="views", hue='date_year',
    palette='rainbow', estimator=sum, ax=ax,
)
ax.set_xlabel('Month')
ax.set_ylabel('Views')
plt.show()
In [31]:
# Total views for each month (rows) and year (columns); empty cells show 0
df_2.pivot_table(
    index='date_month',
    columns='date_year',
    values='views',
    aggfunc=sum,
    fill_value=0,
)
Out[31]:
In [33]:
# Total views per year
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=df_2, x="date_year", y="views",
    palette='rainbow', estimator=sum, ax=ax,
)
ax.set_xlabel('Year')
ax.set_ylabel('Views')
plt.show()
In [40]:
# Number of postings per month, one bar per year
fig, ax = plt.subplots(figsize=(12, 10))
sns.countplot(data=df_2, x="date_month", hue='date_year', ax=ax)
ax.set_xlabel('Month')
ax.set_ylabel('# Jobs')
ax.set_title("Published Jobs")
plt.show()
In [41]:
# Number of postings per month (rows) and year (columns): aggfunc=len counts
# the rows in each group, and fill_value=0 fills month/year pairs with none.
df_2.pivot_table(values='views',index='date_month',columns='date_year',aggfunc=len, fill_value=0)
Out[41]:
In [43]:
# Postings per year for the 6 most frequent main categories
top_categories = df_2['job_category_1'].value_counts().iloc[:6].index
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(
    data=df_2, x="job_category_1", hue='date_year',
    order=top_categories, ax=ax,
)
ax.set_xlabel('Main Category')
ax.set_ylabel('# Jobs')
ax.set_title("Published Jobs/Category")
plt.show()
In [44]:
# Postings per month for the 6 most frequent main categories
top_categories = df_2['job_category_1'].value_counts().iloc[:6].index
fig, ax = plt.subplots(figsize=(18, 8))
sns.countplot(
    data=df_2, x="job_category_1", hue='date_month',
    order=top_categories, ax=ax,
)
ax.set_xlabel('Main Category')
ax.set_ylabel('# Jobs')
ax.set_title("Published Jobs/Category")
plt.show()
In [47]:
# Postings per year for the 6 most frequent main industries
top_industries = df_2['job_industry_1'].value_counts().iloc[:6].index
fig, ax = plt.subplots(figsize=(18, 10))
sns.countplot(
    data=df_2, x="job_industry_1", hue='date_year',
    order=top_industries, ax=ax,
)
ax.set_xlabel('Main Industry')
ax.set_ylabel('# Jobs')
ax.set_title("All Published Jobs/Industry")
plt.show()
In [48]:
# Postings per month (hue) for the 6 most frequent main industries
top_industries = df_2['job_industry_1'].value_counts().iloc[:6].index
fig, ax = plt.subplots(figsize=(18, 8))
sns.countplot(
    data=df_2, x="job_industry_1", hue='date_month',
    order=top_industries, ax=ax,
)
ax.set_xlabel('Main Industry')
ax.set_ylabel('# Jobs')
ax.set_title("Published Jobs/Industry")
plt.show()
In [51]:
# For the 2nd and 3rd category slots: count rows whose value differs from
# 'Select' (presumably the form's unset placeholder — TODO confirm), then show
# the value counts of the rows that equal 'Select'.
for category_col in ('job_category_2', 'job_category_3'):
    equals_select = df_2[category_col] == 'Select'
    print("Non-Selected: ", sum(df_2[~equals_select][category_col].value_counts()))
    print("Selected: ", df_2[equals_select][category_col].value_counts())
In [52]:
# For the 2nd and 3rd industry slots: count rows whose value differs from
# 'Select' (presumably the form's unset placeholder — TODO confirm), then show
# the value counts of the rows that equal 'Select'.
for industry_col in ('job_industry_2', 'job_industry_3'):
    equals_select = df_2[industry_col] == 'Select'
    print("Non-Selected: ", sum(df_2[~equals_select][industry_col].value_counts()))
    print("Selected: ", df_2[equals_select][industry_col].value_counts())
In [54]:
# Postings with no salary given: both bounds are 0.
# The original tested salary_min twice; the second operand must be salary_max,
# otherwise rows with only an upper bound are counted as unspecified.
no_salary = (df_2['salary_min'] == 0) & (df_2['salary_max'] == 0)
print("Not Dedicated Salaries: ", int(no_salary.sum()))
In [56]:
# Mean minimum salary per year (barplot's default estimator is the mean).
# NOTE(review): rows with salary 0 are included in the average.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=df_2, x="date_year", y="salary_min", palette='rainbow', ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Mean(Min Salary)')
plt.show()
In [57]:
# Mean maximum salary per year (barplot's default estimator is the mean).
# NOTE(review): rows with salary 0 are included in the average.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=df_2, x="date_year", y="salary_max", palette='rainbow', ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Mean(Max Salary)')
plt.show()
In [58]:
# Mean of the per-posting average salary (salary_avg) per year.
# NOTE(review): rows with salary 0 are included in the average.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=df_2, x="date_year", y="salary_avg", palette='rainbow', ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Mean(Avg Salary)')
plt.show()
In [60]:
# Mean average salary for every main category (horizontal bars)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=df_2, x="salary_avg", y="job_category_1", palette='rainbow', ax=ax)
ax.set_xlabel('Mean(Avg Salary)')
ax.set_ylabel('Mean Category')
plt.show()
In [61]:
# Mean average salary, restricted to the 6 categories with the most postings
top_categories = df_2['job_category_1'].value_counts().iloc[:6].index
ax = sns.barplot(
    data=df_2,
    x="salary_avg",
    y="job_category_1",
    order=top_categories,
    palette='rainbow',
)
ax.set_xlabel('Mean(Avg Salary)')
ax.set_ylabel('Mean Category')
plt.show()
In [63]:
# Mean average salary for every main industry (tall figure: one bar each)
fig, ax = plt.subplots(figsize=(12, 35))
sns.barplot(data=df_2, x="salary_avg", y="job_industry_1", palette='rainbow', ax=ax)
ax.set_xlabel('Mean(Avg Salary)')
ax.set_ylabel('Mean Industry')
plt.show()
In [64]:
# Mean average salary, restricted to the 6 industries with the most postings
top_industries = df_2['job_industry_1'].value_counts().iloc[:6].index
ax = sns.barplot(
    data=df_2,
    x="salary_avg",
    y="job_industry_1",
    order=top_industries,
    palette='rainbow',
)
ax.set_xlabel('Mean(Avg Salary)')
ax.set_ylabel('Mean Industry')
plt.show()
In [65]:
# Mean of each aggregated column per month (rows) and year (columns).
# NOTE(review): no `values=` is given, so pandas chooses which columns to
# average — confirm this still runs on the installed pandas version now that
# df_2 also holds text columns.
df_2.pivot_table(index='date_month',columns='date_year',aggfunc='mean')
Out[65]:
In [67]:
# Total advertised vacancies per month, one bar per year
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=df_2, x="date_month", y="num_vacancies", hue='date_year',
    palette='rainbow', estimator=sum, ax=ax,
)
ax.set_xlabel('Month')
ax.set_ylabel('# Vacancies')
plt.show()
In [71]:
# Number of postings per career level within the 6 most frequent categories.
top_categories = df_2['job_category_1'].value_counts().iloc[:6].index
fig, ax = plt.subplots(figsize=(12, 8))
sns.countplot(
    x="job_category_1", hue='career_level',
    data=df_2, order=top_categories, ax=ax,
)
# The y axis of a countplot is a count of rows; it was mislabelled
# 'Career Level' (career level is the hue). Now matches the industry plot.
ax.set(xlabel='Category', ylabel='# Jobs')
plt.show()
In [70]:
# Number of postings per career level within the 6 most frequent industries
top_industries = df_2['job_industry_1'].value_counts().iloc[:6].index
fig, ax = plt.subplots(figsize=(16, 8))
sns.countplot(
    data=df_2, x="job_industry_1", hue='career_level',
    order=top_industries, ax=ax,
)
ax.set_xlabel('Industry')
ax.set_ylabel('# Jobs')
plt.show()
In [72]:
# Concatenate all job titles and drop English stop words.
# The stop-word list is fetched once and held in a set; the original called
# stopwords.words('english') again for every single word.
english_stopwords = set(stopwords.words('english'))
all_text = " ".join(df_2['displayed_job_title'])
clean_text = " ".join(
    word for word in all_text.split() if word not in english_stopwords
)
In [73]:
# Render the stop-word-free job-title text as a word cloud
wordcloud = WordCloud(max_font_size=30).generate(clean_text)
figure_options = dict(figsize=(12, 16), dpi=300, facecolor='w', edgecolor='k')
plt.figure(**figure_options)
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
In [75]:
# Count 2- to 4-word phrases (ngram_range=(2,4)) across all job titles.
# CountVectorizer is already imported in the imports cell at the top.
word_vectorizer = CountVectorizer(ngram_range=(2,4), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(df_2['displayed_job_title'])
# Column sums of the document-term matrix = corpus-wide phrase frequencies.
# The sparse matrix's own .sum(axis=0) replaces the builtin sum(), which added
# the rows together one at a time in Python.
frequencies = np.asarray(sparse_matrix.sum(axis=0)).ravel()
In [76]:
# Table of the 50 most frequent job-title phrases.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(word_vectorizer, 'get_feature_names_out'):
    feature_names = word_vectorizer.get_feature_names_out()
else:
    feature_names = word_vectorizer.get_feature_names()
job_title_keywords = (
    pd.DataFrame(frequencies, index=feature_names, columns=['frequency'])
    .sort_values('frequency', ascending=False)  # no in-place mutation
)
job_title_keywords.head(50)
Out[76]:
In [77]:
# Correlation heatmap of the numeric columns.
# The columns are selected explicitly: df_2 also holds text columns, and
# DataFrame.corr() on pandas 2.x raises on those instead of skipping them.
numeric_columns = df_2.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_columns.corr(),cmap='coolwarm',annot=True)
Out[77]:
In [78]:
# Missing-value map: heatmap of df_2.isnull(), one row per posting and one
# column per field, with row labels and the colour bar switched off.
sns.heatmap(df_2.isnull(),yticklabels=False,cbar=False,cmap='viridis')
Out[78]:
In [79]:
# Convert each listed column to a pandas Categorical and store its integer
# category codes in a sibling "<column>_codes" column.
columns_to_encode = (
    'job_industry_1',
    'job_category_1',
    'career_level',
    'city_name',
    'date_month',
)
for column_name in columns_to_encode:
    df_2[column_name] = pd.Categorical(df_2[column_name])
    df_2[column_name + '_codes'] = df_2[column_name].cat.codes
In [119]:
# Features and target for the views model; only postings with views > 0
view_features = [
    'experience_years_min', 'experience_years_max', 'date_month_codes',
    'salary_avg', 'num_vacancies',
    'job_industry_1_codes', 'job_category_1_codes', 'career_level_codes',
]
has_views = df_2['views'] > 0
train_data = df_2.loc[has_views, view_features]
train_cls = df_2.loc[has_views, 'views']
In [120]:
# Single hold-out split (not k-fold cross-validation): test_size=0.5 keeps half
# of the rows for testing; random_state=101 makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(train_data, train_cls, test_size=0.5, random_state=101)
In [121]:
# Fit a linear regression on the training half of the split.
lm = LinearRegression()
lm.fit(X_train,y_train)
Out[121]:
In [122]:
# One row per feature with its fitted coefficient
coeff_df = pd.DataFrame({'Coefficient': lm.coef_}, index=train_data.columns)
coeff_df
Out[122]:
In [124]:
# Root mean squared error of the views model on the held-out half.
# `predictions` was never computed for this model anywhere in the notebook,
# so on a fresh kernel this cell raised NameError (or used stale values).
predictions = lm.predict(X_test)
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))
In [137]:
# Features and target for the salary model; only postings with salary_avg > 0
salary_features = [
    'experience_years_min', 'experience_years_max', 'date_month_codes',
    'num_vacancies', 'views',
    'job_industry_1_codes', 'job_category_1_codes', 'career_level_codes',
]
has_salary = df_2['salary_avg'] > 0
train_data = df_2.loc[has_salary, salary_features]
train_cls = df_2.loc[has_salary, 'salary_avg']
In [138]:
# 50/50 hold-out split, fit, and show the per-feature coefficients
X_train, X_test, y_train, y_test = train_test_split(
    train_data, train_cls, test_size=0.5, random_state=101
)
lm = LinearRegression().fit(X_train, y_train)
coeff_df = pd.DataFrame({'Coefficient': lm.coef_}, index=train_data.columns)
coeff_df
Out[138]:
In [139]:
# Root mean squared error of the salary model on the held-out half.
predictions = lm.predict(X_test)
# The label says RMSE, so take the square root; the original printed the
# plain mean squared error under that label.
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))