Init



In [1]:

    
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import datetime
import html2text
import re
from nltk.corpus import stopwords



In [2]:

    
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
%matplotlib inline



In [3]:

    
from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from plotly.offline.offline import _plot_html
import cufflinks as cf

init_notebook_mode(connected=True)
cf.go_offline()









    











    



IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.



In [4]:

    
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer



In [6]:

    
def clean_txt(text):
    search = ['أ','آ','إ','ة','ى']
    replace = ['ا','ا','ا','ه','ي']
    
    
    for i in range(len(search)):
        text = text.replace(search[i], replace[i])
    
    words = re.split("[^0-9a-zA-Zابتثجحخدزرزسشصضطظعغفقكلومنهةىؤئء,يأآئ]*", text)
    
    return " ".join(words).strip().lower()


def filter_exp_years(text):
    #check if there are numbers
    text = text.replace("-", ' - ')
    text = text.replace("+", ' - ')
    text = text.replace("/", ' - ')
    text = text.replace("~", ' - ')
    text = word_tokenize(text)
    
    numbers = [int(s) for s in text if s.isdigit()]
    words = [s.lower().strip() for s in text if not s.isdigit()]
    
    per = 'year'
    if('month' in words or 'months' in words):
        per = 'month'
    if(len(numbers) == 1):
        if('less' in words or 'maximum' in words): 
            return (0,max(numbers), per)
        elif('more' in words or 'above' in words or 'minimum' in words):
            return (max(numbers),max(numbers), per)
        else:
            return (0,max(numbers), per)
    elif (len(numbers) > 1):
        return (min(numbers), max(numbers), per)
    else:
        return (0,0, per)

Load Data



In [5]:

    
df = pd.read_csv('data_science_dataset_wuzzuf.csv')

Define/Edit New Data Clone



In [18]:

    
# copy df and append new cols
df_2 = df.copy()
df_2['salary_avg'] = pd.Series(0, index=df.index)
df_2['clean_description'] = pd.Series('', index=df.index)
df_2['clean_job_requirements'] = pd.Series('', index=df.index)
df_2['date_year'] = pd.Series('', index=df.index)
df_2['date_month'] = pd.Series('', index=df.index)
df_2['experience_years_type'] = pd.Series('year', index=df.index)
df_2['experience_years_min'] = pd.Series(0, index=df.index)
df_2['experience_years_max'] = pd.Series(0, index=df.index)

Data Cleaning



In [19]:

    
err = 0
for i in range(df_2.shape[0]):
    # filter salary
    if(int(df_2.iloc[i]['salary_min']) <= 0 ):
        df_2.set_value(i, 'salary_min', 0)
    if(int(df_2.iloc[i]['salary_max']) <= 0 ):
        df_2.set_value(i, 'salary_max', 0)

    df_2.set_value(i, 'salary_avg', (df_2.iloc[i]['salary_max'] + df_2.iloc[i]['salary_min'])/2.0 )
    
    # filter num vacancies
    if(int(df_2.iloc[i]['num_vacancies']) <= 0 ):
        df_2.set_value(i, 'num_vacancies', 0)
    
    # filter exp years
    exp_yrs = filter_exp_years(str(df_2.iloc[i]['experience_years']))
    df_2.set_value(i, 'experience_years_min', exp_yrs[0])
    df_2.set_value(i, 'experience_years_max', exp_yrs[1])
    df_2.set_value(i, 'experience_years_type', exp_yrs[2])
    
    # parse date/time
    d = df_2.iloc[i]['post_date'].split('-')
    df_2.set_value(i, 'date_year', d[0])
    df_2.set_value(i, 'date_month', datetime.date(1900, int(d[1]), 1).strftime('%B'))
    
    # filter job_reqs
    df_2.set_value(i, 'clean_description', html2text.html2text(str(df_2.iloc[i]['description'])))
    df_2.set_value(i, 'clean_job_requirements', html2text.html2text(str(df_2.iloc[i]['job_requirements'])))
    
    # filter city_name
    df_2.set_value(i, 'city_name', clean_txt(df_2.iloc[i]['city_name']))









    



/usr/lib/python3.5/re.py:203: FutureWarning:

split() requires a non-empty pattern match.

City Names Normalizing | Cleaning



In [20]:

    
# load cities lexicon
c_lex = []
for line in open('cities.csv'):
    words = clean_txt(line).split(',')
    c_lex.append(words)









    



/usr/lib/python3.5/re.py:203: FutureWarning:

split() requires a non-empty pattern match.



In [21]:

    
# clean city names [filter|normalize]
city_jobs_counts = {}
dbl = 0
dbl_queue = []
for i in range(df.shape[0]):
    if i+1%100 == 0: print(i)
    city_words = word_tokenize(df_2.iloc[i]['city_name'].lower())

    for c in c_lex:
        for w in city_words:
            if w in c or  w.replace('ال','') in c:
                df_2.set_value(i, 'city_name', c[0])
                if c[0] in city_jobs_counts:
                    city_jobs_counts[c[0]] += 1
                else:
                    city_jobs_counts[c[0]] = 0

# sort by top countries
df_2['city_name'].value_counts().sort_values(inplace=True, ascending=False)



In [23]:

    
df_2.iloc[208]









    Out[23]:





id                                                                 afb02d06
city_name                                                             cairo
displayed_job_title                                      Web Content Writer
job_category_1                                            Editorial/Writing
job_category_2                                                    Marketing
job_category_3                                                     Research
job_industry_1                                          Writing and Editing
job_industry_2                                 Marketing and Advertising   
job_industry_3                                                  Research   
salary_min                                                             2000
salary_max                                                             2500
num_vacancies                                                             1
career_level                                                    Entry Level
experience_years                                                         1+
post_date                                               2014-01-22 15:01:21
views                                                                  1937
description               <p>Do you love to write? Are you proficient it...
job_requirements          <strong>You must meet the following criteria t...
salary_avg                                                             2250
clean_description         Do you love to write? Are you proficient it wr...
clean_job_requirements    **You must meet the following criteria to appl...
date_year                                                              2014
date_month                                                          January
experience_years_type                                                  year
experience_years_min                                                      0
experience_years_max                                                      1
Name: 208, dtype: object

Job Titles Normalizing | Cleaning



In [24]:

    
# clean displayed_job_title
for i in range(df_2.shape[0]):
    df_2.set_value(i, 'displayed_job_title', clean_txt(df_2.iloc[i]['displayed_job_title']))









    



/usr/lib/python3.5/re.py:203: FutureWarning:

split() requires a non-empty pattern match.

Analytics

Cities



In [25]:

    
# top cities jobs
dt = [ df_2['city_name'].value_counts().index[z] for z in range(7)]
counts = [ df_2['city_name'].value_counts()[z] for z in range(7)]

plt.figure(figsize=(12,8))
sns.barplot(x=dt,y=counts)









    Out[25]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fd56c381ef0>

views



In [29]:

    
# view per months [violinplot]
plt.figure(figsize=(16,18))
ax = sns.violinplot(x="date_month", y="views",hue='date_year', data=df_2,palette='rainbow',estimator=sum)
ax.set(xlabel='Month', ylabel='Views')
plt.show()



In [30]:

    
# view per months [bar] - per three years
plt.figure(figsize=(12,8))
ax = sns.barplot(x="date_month", y="views", data=df_2,hue='date_year',palette='rainbow',estimator=sum)
ax.set(xlabel='Month', ylabel='Views')
plt.show()



In [31]:

    
# sum(views) per month
df_2.pivot_table(values='views',index='date_month',columns='date_year',aggfunc=sum, fill_value=0)



In [33]:

    
# years / views
plt.figure(figsize=(12,8))
ax = sns.barplot(x="date_year", y="views", data=df_2,palette='rainbow',estimator=sum)
ax.set(xlabel='Year', ylabel='Views')
plt.show()

Published Jobs



In [40]:

    
# jobs published per months/year
plt.figure(figsize=(12,10))
ax = sns.countplot(x="date_month", hue='date_year' ,data=df_2)
ax.set(xlabel='Month', ylabel='# Jobs',title="Published Jobs")
plt.show()



In [41]:

    
df_2.pivot_table(values='views',index='date_month',columns='date_year',aggfunc=len, fill_value=0)



In [43]:

    
# top 6 categories published jobs
plt.figure(figsize=(12,8))
ax = sns.countplot(x="job_category_1", hue='date_year' ,data=df_2, order=df_2['job_category_1'].value_counts().iloc[:6].index)
ax.set(xlabel='Main Category', ylabel='# Jobs',title="Published Jobs/Category")
plt.show()



In [44]:

    
# jobs published category/month
plt.figure(figsize=(18,8))
ax = sns.countplot(x="job_category_1", hue='date_month' ,data=df_2, order=df_2['job_category_1'].value_counts().iloc[:6].index)
ax.set(xlabel='Main Category', ylabel='# Jobs',title="Published Jobs/Category")
plt.show()



In [47]:

    
# jobs published industry/year
plt.figure(figsize=(18,10))
ax = sns.countplot(x="job_industry_1", hue='date_year' ,data=df_2, order=df_2['job_industry_1'].value_counts().iloc[:6].index)
ax.set(xlabel='Main Industry', ylabel='# Jobs',title="All Published Jobs/Industry")
plt.show()



In [48]:

    
# jobs published industry/year
plt.figure(figsize=(18,8))
ax = sns.countplot(x="job_industry_1", hue='date_month' ,data=df_2, order=df_2['job_industry_1'].value_counts().iloc[:6].index)
ax.set(xlabel='Main Industry', ylabel='# Jobs',title="Published Jobs/Industry")
plt.show()



In [51]:

    
# number of missed/existed job_category_2
print("Non-Selected: ",sum(df_2[df_2['job_category_2']!='Select']['job_category_2'].value_counts()))
print("Selected: ",df_2[df_2['job_category_2']=='Select']['job_category_2'].value_counts())

# number of missed/existed job_category_3
print("Non-Selected: ",sum(df_2[df_2['job_category_3']!='Select']['job_category_3'].value_counts()))
print("Selected: ",df_2[df_2['job_category_3']=='Select']['job_category_3'].value_counts())









    



Non-Selected:  11591
Selected:  Select    10259
Name: job_category_2, dtype: int64
Non-Selected:  3062
Selected:  Select    18788
Name: job_category_3, dtype: int64



In [52]:

    
# number of missed/existed job_category_2
print("Non-Selected: ",sum(df_2[df_2['job_industry_2']!='Select']['job_industry_2'].value_counts()))
print("Selected: ",df_2[df_2['job_industry_2']=='Select']['job_industry_2'].value_counts())

# number of missed/existed job_category_3
print("Non-Selected: ",sum(df_2[df_2['job_industry_3']!='Select']['job_industry_3'].value_counts()))
print("Selected: ",df_2[df_2['job_industry_3']=='Select']['job_industry_3'].value_counts())









    



Non-Selected:  9149
Selected:  Select    12701
Name: job_industry_2, dtype: int64
Non-Selected:  4074
Selected:  Select    17776
Name: job_industry_3, dtype: int64

Salaries



In [54]:

    
# not dedicated salaries
print("Not Dedicated Salaries: ",df_2[(df_2['salary_min'] ==0) & (df_2['salary_min'] ==0)].shape[0])









    



Not Dedicated Salaries:  189



In [56]:

    
# mean of min salaries / year
plt.figure(figsize=(12,8))
ax = sns.barplot(x="date_year", y="salary_min", data=df_2,palette='rainbow')
ax.set(xlabel='Year', ylabel='Mean(Min Salary)')
plt.show()



In [57]:

    
# mean of max salaries / year
plt.figure(figsize=(12,8))
ax = sns.barplot(x="date_year", y="salary_max", data=df_2,palette='rainbow')
ax.set(xlabel='Year', ylabel='Mean(Max Salary)')
plt.show()



In [58]:

    
# mean of max salaries / year
plt.figure(figsize=(12,8))
ax = sns.barplot(x="date_year", y="salary_avg", data=df_2,palette='rainbow')
ax.set(xlabel='Year', ylabel='Mean(Avg Salary)')
plt.show()



In [60]:

    
# mean of avg salaries / job_category_1 - all
plt.figure(figsize=(12,8))
ax = sns.barplot(x="salary_avg", y="job_category_1", data=df_2,palette='rainbow')
ax.set(xlabel='Mean(Avg Salary)', ylabel='Mean Category')
plt.show()



In [61]:

    
# avg salaries per top 6 published categories jobs
ax = sns.barplot(x="salary_avg", y="job_category_1", data=df_2,palette='rainbow',order=df_2['job_category_1'].value_counts().iloc[:6].index)
ax.set(xlabel='Mean(Avg Salary)', ylabel='Mean Category')
plt.show()



In [63]:

    
# mean of avg salaries / job_industry_1 - all
plt.figure(figsize=(12,35))
ax = sns.barplot(x="salary_avg", y="job_industry_1", data=df_2,palette='rainbow')
ax.set(xlabel='Mean(Avg Salary)', ylabel='Mean Industry')
plt.show()



In [64]:

    
ax = sns.barplot(x="salary_avg", y="job_industry_1", data=df_2,palette='rainbow',order=df_2['job_industry_1'].value_counts().iloc[:6].index)
ax.set(xlabel='Mean(Avg Salary)', ylabel='Mean Industry')
plt.show()



In [65]:

    
# Overall Mean Projects
df_2.pivot_table(index='date_month',columns='date_year',aggfunc='mean')









    Out[65]:







  
    
      
      experience_years_max
      experience_years_min
      num_vacancies
      salary_avg
      salary_max
      salary_min
      views
    
    
      date_year
      2014
      2015
      2016
      2014
      2015
      2016
      2014
      2015
      2016
      2014
      ...
      2016
      2014
      2015
      2016
      2014
      2015
      2016
      2014
      2015
      2016
    
    
      date_month
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      April
      3.217722
      3.863576
      3.567986
      0.832911
      1.075497
      1.081756
      8.987342
      4.719205
      6.048193
      3208.420253
      ...
      3977.730637
      3946.711392
      5838.357616
      4876.561962
      2470.129114
      3511.063576
      3078.901893
      1852.551899
      1366.577483
      987.766781
    
    
      August
      3.398374
      3.643469
      NaN
      0.973577
      1.118844
      NaN
      6.955285
      5.033191
      NaN
      4031.646341
      ...
      NaN
      5103.821138
      5343.888651
      NaN
      2959.471545
      3170.862955
      NaN
      1408.914634
      1567.472163
      NaN
    
    
      December
      3.769131
      3.566520
      NaN
      1.068742
      0.935683
      NaN
      5.181582
      6.769163
      NaN
      4314.578470
      ...
      NaN
      5356.390402
      6496.236123
      NaN
      3272.770428
      4197.595595
      NaN
      1448.962387
      1362.493392
      NaN
    
    
      February
      3.673522
      3.802372
      3.745540
      1.074550
      1.077734
      1.054460
      6.375321
      4.038208
      4.230047
      4117.030848
      ...
      4429.411268
      5103.084833
      5280.096179
      5447.897653
      3130.976864
      3231.524374
      3410.929577
      1586.197943
      1324.471673
      1144.381221
    
    
      January
      3.462046
      3.547200
      3.463500
      0.970297
      0.857600
      0.959543
      6.937294
      6.441600
      6.032542
      3794.821782
      ...
      4668.882146
      4730.026403
      5239.459200
      5589.662269
      2859.620462
      3044.396800
      3748.105541
      1416.844884
      1428.425600
      1145.574318
    
    
      July
      3.364185
      3.480118
      NaN
      0.782696
      1.078056
      NaN
      5.955734
      4.001473
      NaN
      3907.257545
      ...
      NaN
      5326.026157
      5295.702504
      NaN
      2488.507042
      3283.758468
      NaN
      1454.269618
      1614.217968
      NaN
    
    
      June
      3.347253
      3.543933
      NaN
      0.949451
      1.012552
      NaN
      6.896703
      5.938633
      NaN
      3895.206593
      ...
      NaN
      4841.760440
      5120.926081
      NaN
      2948.657143
      3013.824268
      NaN
      1530.175824
      1457.177127
      NaN
    
    
      March
      3.831878
      3.538551
      3.525342
      1.102620
      0.934579
      1.068383
      6.703057
      6.389019
      5.849558
      3955.329694
      ...
      4264.663717
      4956.672489
      7491.241822
      5231.854385
      2953.993450
      3540.587617
      3297.477876
      1624.644105
      1418.753505
      1187.883347
    
    
      May
      3.294118
      3.572057
      3.699647
      0.998102
      0.991199
      1.180212
      9.056926
      7.006601
      4.559482
      3489.791271
      ...
      4237.050648
      4265.284630
      5280.187019
      5221.664311
      2714.297913
      3059.213421
      3252.440518
      1608.189753
      1261.792079
      558.813899
    
    
      November
      3.754917
      3.479204
      NaN
      1.173979
      0.893309
      NaN
      6.237519
      6.216094
      NaN
      3872.636914
      ...
      NaN
      4785.467474
      6251.448463
      NaN
      2959.806354
      4141.556962
      NaN
      1540.397882
      1499.528029
      NaN
    
    
      October
      3.348231
      3.944186
      NaN
      0.813780
      1.052093
      NaN
      6.560521
      4.496744
      NaN
      3908.951583
      ...
      NaN
      4970.707635
      7169.018605
      NaN
      2847.199255
      4697.768372
      NaN
      1562.610801
      1598.983256
      NaN
    
    
      September
      3.403986
      3.762082
      NaN
      0.898551
      1.006196
      NaN
      5.536232
      5.809170
      NaN
      3773.369565
      ...
      NaN
      4869.021739
      5241.257745
      NaN
      2677.721014
      3185.247831
      NaN
      1583.326087
      1628.754647
      NaN
    
  

12 rows × 21 columns

Vacancies



In [67]:

    
# vacancies [months/years]
plt.figure(figsize=(12,8))
ax = sns.barplot(x="date_month", y="num_vacancies", data=df_2,hue='date_year',palette='rainbow',estimator=sum)
ax.set(xlabel='Month', ylabel='# Vacancies')
plt.show()

Career Levels



In [71]:

    
# no. career levels jobs / top job_categories
plt.figure(figsize=(12,8))
ax = sns.countplot(x="job_category_1", hue='career_level' ,data=df_2, order=df_2['job_category_1'].value_counts().iloc[:6].index)
ax.set(xlabel='Category', ylabel='Career Level')
plt.show()



In [70]:

    
# career levels count / top job_industries
plt.figure(figsize=(16,8))
ax = sns.countplot(x="job_industry_1", hue='career_level' ,data=df_2, order=df_2['job_industry_1'].value_counts().iloc[:6].index)
ax.set(xlabel='Industry', ylabel='# Jobs')
plt.show()

job Titles



In [72]:

    
# remove english stop words
all_text = " ".join(df_2['displayed_job_title'])
clean_text = [word for word in all_text.split() if word not in stopwords.words('english')]
clean_text = " ".join(clean_text)



In [73]:

    
# WordCloud of Job Titles
wordcloud = WordCloud(max_font_size=30).generate(clean_text)
plt.figure(figsize=(12, 16), dpi=300, facecolor='w', edgecolor='k')
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()



In [75]:

    
# top 50 job titles [2,3]-grams
from sklearn.feature_extraction.text import CountVectorizer
word_vectorizer = CountVectorizer(ngram_range=(2,4), analyzer='word')
sparse_matrix = word_vectorizer.fit_transform(df_2['displayed_job_title'])
frequencies = sum(sparse_matrix).toarray()[0]



In [76]:

    
job_title_keywords = pd.DataFrame(frequencies, index=word_vectorizer.get_feature_names(), columns=['frequency'])
job_title_keywords.sort_values(['frequency'], ascending=[False], inplace=True)
job_title_keywords.head(50)









    Out[76]:







  
    
      
      frequency
    
  
  
    
      customer service
      1000
    
    
      call center
      772
    
    
      graphic designer
      755
    
    
      sales engineer
      653
    
    
      marketing specialist
      613
    
    
      social media
      503
    
    
      web developer
      490
    
    
      center agent
      480
    
    
      call center agent
      478
    
    
      technical support
      421
    
    
      software developer
      398
    
    
      service agent
      366
    
    
      customer service agent
      365
    
    
      php developer
      361
    
    
      account manager
      357
    
    
      net developer
      328
    
    
      media specialist
      314
    
    
      web designer
      312
    
    
      social media specialist
      306
    
    
      telesales agent
      293
    
    
      service representative
      287
    
    
      customer service representative
      281
    
    
      software engineer
      278
    
    
      marketing executive
      278
    
    
      project manager
      269
    
    
      senior software
      263
    
    
      java developer
      248
    
    
      sales executive
      228
    
    
      technical office
      227
    
    
      marketing manager
      227
    
    
      digital marketing
      226
    
    
      quality control
      218
    
    
      support engineer
      212
    
    
      ios developer
      211
    
    
      quality assurance
      200
    
    
      sales representative
      197
    
    
      sales account
      195
    
    
      site engineer
      194
    
    
      mechanical engineer
      193
    
    
      android developer
      192
    
    
      maintenance engineer
      188
    
    
      support agent
      185
    
    
      senior net
      171
    
    
      sales account manager
      169
    
    
      senior php
      169
    
    
      office engineer
      158
    
    
      team leader
      156
    
    
      technical office engineer
      152
    
    
      front end
      149
    
    
      sales marketing
      133

Machine Learnign - Data Mining



In [77]:

    
sns.heatmap(df_2.corr(),cmap='coolwarm',annot=True)









    Out[77]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fd56ead8898>



In [78]:

    
# null heatmap
sns.heatmap(df_2.isnull(),yticklabels=False,cbar=False,cmap='viridis')









    Out[78]:





<matplotlib.axes._subplots.AxesSubplot at 0x7fd56eb1f5c0>

Data Pre-Processing



In [79]:

    
# change categories to numbers
df_2['job_industry_1'] = pd.Categorical(df_2['job_industry_1'])
df_2['job_industry_1_codes'] = df_2['job_industry_1'].cat.codes

df_2['job_category_1'] = pd.Categorical(df_2['job_category_1'])
df_2['job_category_1_codes'] = df_2['job_category_1'].cat.codes

df_2['career_level'] = pd.Categorical(df_2['career_level'])
df_2['career_level_codes'] = df_2['career_level'].cat.codes

df_2['city_name'] = pd.Categorical(df_2['city_name'])
df_2['city_name_codes'] = df_2['city_name'].cat.codes

df_2['date_month'] = pd.Categorical(df_2['date_month'])
df_2['date_month_codes'] = df_2['date_month'].cat.codes

Views Linear Regression



In [119]:

    
# Linear regression to predict views
train_data = df_2[df_2['views']>0][['experience_years_min', 'experience_years_max','date_month_codes','salary_avg',
                   'num_vacancies',
                    'job_industry_1_codes', 'job_category_1_codes', 'career_level_codes']]
train_cls = df_2[df_2['views']>0]['views']



In [120]:

    
# cross validation
X_train, X_test, y_train, y_test = train_test_split(train_data, train_cls, test_size=0.5, random_state=101)



In [121]:

    
lm = LinearRegression()
lm.fit(X_train,y_train)









    Out[121]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [122]:

    
# Coefficients Table
coeff_df = pd.DataFrame(lm.coef_,train_data.columns,columns=['Coefficient'])
coeff_df









    Out[122]:







  
    
      
      Coefficient
    
  
  
    
      experience_years_min
      -7.188270
    
    
      experience_years_max
      18.157674
    
    
      date_month_codes
      12.963801
    
    
      salary_avg
      -0.001273
    
    
      num_vacancies
      4.111769
    
    
      job_industry_1_codes
      0.954166
    
    
      job_category_1_codes
      -11.335219
    
    
      career_level_codes
      -22.666678



In [124]:

    
# root mean squared errro | evaluation
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))









    



RMSE: 1106.31143616

Avg Salaries Regression



In [137]:

    
# Linear regression to predict avg salaries
train_data = df_2[df_2['salary_avg']>0][['experience_years_min', 'experience_years_max','date_month_codes',
                   'num_vacancies','views',
                   'job_industry_1_codes', 'job_category_1_codes', 'career_level_codes']]
train_cls = df_2[df_2['salary_avg']>0]['salary_avg']



In [138]:

    
X_train, X_test, y_train, y_test = train_test_split(train_data, train_cls, test_size=0.5, random_state=101)

lm = LinearRegression()
lm.fit(X_train,y_train)

coeff_df = pd.DataFrame(lm.coef_,train_data.columns,columns=['Coefficient'])
coeff_df









    Out[138]:







  
    
      
      Coefficient
    
  
  
    
      experience_years_min
      -442.363891
    
    
      experience_years_max
      1015.899177
    
    
      date_month_codes
      17.211951
    
    
      num_vacancies
      3.481535
    
    
      views
      -0.036280
    
    
      job_industry_1_codes
      0.339017
    
    
      job_category_1_codes
      11.779490
    
    
      career_level_codes
      1260.586171



In [139]:

    
predictions = lm.predict(X_test)
# root mean squared errro | evaluation
print('RMSE:', (metrics.mean_squared_error(y_test, predictions)))









    



RMSE: 57953206.19

date_year	2014	2015	2016
date_month
April	731758	1031766	1147785
August	693186	1464019	0
December	1117150	1546430	0
February	617031	1005274	1218766
January	429304	892766	1302518
July	722772	1096054	0
June	696230	1044796	0
March	744087	1214453	1476539
May	847516	1146969	474433
November	1018203	1658478	0
October	839122	1718907	0
September	873996	1314405	0

date_year	2014	2015	2016
date_month
April	395	755	1162
August	492	934	0
December	771	1135	0
February	389	759	1065
January	303	625	1137
July	497	679	0
June	455	717	0
March	458	856	1243
May	527	909	849
November	661	1106	0
October	537	1075	0
September	552	807	0

	frequency
customer service	1000
call center	772
graphic designer	755
sales engineer	653
marketing specialist	613
social media	503
web developer	490
center agent	480
call center agent	478
technical support	421
software developer	398
service agent	366
customer service agent	365
php developer	361
account manager	357
net developer	328
media specialist	314
web designer	312
social media specialist	306
telesales agent	293
service representative	287
customer service representative	281
software engineer	278
marketing executive	278
project manager	269
senior software	263
java developer	248
sales executive	228
technical office	227
marketing manager	227
digital marketing	226
quality control	218
support engineer	212
ios developer	211
quality assurance	200
sales representative	197
sales account	195
site engineer	194
mechanical engineer	193
android developer	192
maintenance engineer	188
support agent	185
senior net	171
sales account manager	169
senior php	169
office engineer	158
team leader	156
technical office engineer	152
front end	149
sales marketing	133

	Coefficient
experience_years_min	-7.188270
experience_years_max	18.157674
date_month_codes	12.963801
salary_avg	-0.001273
num_vacancies	4.111769
job_industry_1_codes	0.954166
job_category_1_codes	-11.335219
career_level_codes	-22.666678

	Coefficient
experience_years_min	-442.363891
experience_years_max	1015.899177
date_month_codes	17.211951
num_vacancies	3.481535
views	-0.036280
job_industry_1_codes	0.339017
job_category_1_codes	11.779490
career_level_codes	1260.586171