notebook.community

Edit and run



In [1]:

    
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
from scipy.sparse import hstack



In [2]:

    
train_data = pd.read_csv('resources/salary-train.csv')



In [3]:

    
train_data.head(10)









    Out[3]:







  
    
      
      FullDescription
      LocationNormalized
      ContractTime
      SalaryNormalized
    
  
  
    
      0
      International Sales Manager London ****k  ****...
      London
      permanent
      33000
    
    
      1
      An ideal opportunity for an individual that ha...
      London
      permanent
      50000
    
    
      2
      Online Content and Brand Manager// Luxury Reta...
      South East London
      permanent
      40000
    
    
      3
      A great local marketleader is seeking a perman...
      Dereham
      permanent
      22500
    
    
      4
      Registered Nurse / RGN  Nursing Home for Young...
      Sutton Coldfield
      NaN
      20355
    
    
      5
      Sales and Marketing Assistant will provide adm...
      Crawley
      NaN
      22500
    
    
      6
      Vacancy Ladieswear fashion Area Manager / Regi...
      UK
      permanent
      32000
    
    
      7
      Reference: LR/JAN/**** Our client is one of th...
      Bristol
      permanent
      30000
    
    
      8
      Sponsorship Manager London The Company A marke...
      Central London
      permanent
      31500
    
    
      9
      About Barclays Barclays moves, lends, invests ...
      South East London
      permanent
      42499



In [4]:

    
train_data['FullDescription'] = train_data['FullDescription'].str.lower()
train_data['FullDescription'] = train_data['FullDescription'].replace('[^a-zA-Z0-9]', ' ', regex=True)

train_data['LocationNormalized'] = train_data['LocationNormalized'].str.lower()
train_data['LocationNormalized'] = train_data['LocationNormalized'].replace('[^a-zA-Z0-9]', ' ', regex=True)

train_data['ContractTime'] = train_data['ContractTime'].str.lower()
train_data['ContractTime'] = train_data['ContractTime'].replace('[^a-zA-Z0-9]', ' ', regex=True)

train_data['LocationNormalized'].fillna('nan', inplace=True)
train_data['ContractTime'].fillna('nan', inplace=True)



In [5]:

    
train_data.head(10)









    Out[5]:







  
    
      
      FullDescription
      LocationNormalized
      ContractTime
      SalaryNormalized
    
  
  
    
      0
      international sales manager london     k      ...
      london
      permanent
      33000
    
    
      1
      an ideal opportunity for an individual that ha...
      london
      permanent
      50000
    
    
      2
      online content and brand manager   luxury reta...
      south east london
      permanent
      40000
    
    
      3
      a great local marketleader is seeking a perman...
      dereham
      permanent
      22500
    
    
      4
      registered nurse   rgn  nursing home for young...
      sutton coldfield
      nan
      20355
    
    
      5
      sales and marketing assistant will provide adm...
      crawley
      nan
      22500
    
    
      6
      vacancy ladieswear fashion area manager   regi...
      uk
      permanent
      32000
    
    
      7
      reference  lr jan      our client is one of th...
      bristol
      permanent
      30000
    
    
      8
      sponsorship manager london the company a marke...
      central london
      permanent
      31500
    
    
      9
      about barclays barclays moves  lends  invests ...
      south east london
      permanent
      42499



In [6]:

    
vectorizer = TfidfVectorizer(min_df=5)



In [7]:

    
description_vectorized = vectorizer.fit_transform(train_data['FullDescription'])



In [8]:

    
dict_vectorizer = DictVectorizer()

location_and_contract_time_vectorized = dict_vectorizer.fit_transform(train_data[['LocationNormalized', 'ContractTime']].to_dict('records'))



In [9]:

    
X = hstack([description_vectorized, location_and_contract_time_vectorized])



In [10]:

    
ridge = Ridge(alpha=1, random_state=241)



In [11]:

    
ridge.fit(X, train_data['SalaryNormalized'])









    Out[11]:





Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)



In [12]:

    
test_data = pd.read_csv('resources/salary-test-mini.csv')



In [13]:

    
test_data.head(10)









    Out[13]:







  
    
      
      FullDescription
      LocationNormalized
      ContractTime
      SalaryNormalized
    
  
  
    
      0
      We currently have a vacancy for an HR Project ...
      Milton Keynes
      contract
      NaN
    
    
      1
      A Web developer opportunity has arisen with an...
      Manchester
      permanent
      NaN



In [14]:

    
test_description = test_data['FullDescription'].str.lower()
test_description = test_description.replace('[^a-zA-Z0-9]', ' ', regex=True)
test_location = test_data['LocationNormalized'].str.lower()
test_location = test_location.replace('[^a-zA-Z0-9]', ' ', regex=True)
test_contact_time = test_data['ContractTime'].str.lower()
test_contact_time = test_contact_time.replace('[^a-zA-Z0-9]', ' ', regex=True)



In [15]:

    
test_description_vectorized = vectorizer.transform(test_description)
test_location_and_contract_time_vectorized = dict_vectorizer.transform([test_location, test_contact_time])



In [16]:

    
Y = hstack([test_description_vectorized, test_location_and_contract_time_vectorized])



In [17]:

    
ridge.predict(Y)









    Out[17]:





array([53645.02006675, 39364.75513443])



In [18]:

    
train_data.head(10)









    Out[18]:







  
    
      
      FullDescription
      LocationNormalized
      ContractTime
      SalaryNormalized
    
  
  
    
      0
      international sales manager london     k      ...
      london
      permanent
      33000
    
    
      1
      an ideal opportunity for an individual that ha...
      london
      permanent
      50000
    
    
      2
      online content and brand manager   luxury reta...
      south east london
      permanent
      40000
    
    
      3
      a great local marketleader is seeking a perman...
      dereham
      permanent
      22500
    
    
      4
      registered nurse   rgn  nursing home for young...
      sutton coldfield
      nan
      20355
    
    
      5
      sales and marketing assistant will provide adm...
      crawley
      nan
      22500
    
    
      6
      vacancy ladieswear fashion area manager   regi...
      uk
      permanent
      32000
    
    
      7
      reference  lr jan      our client is one of th...
      bristol
      permanent
      30000
    
    
      8
      sponsorship manager london the company a marke...
      central london
      permanent
      31500
    
    
      9
      about barclays barclays moves  lends  invests ...
      south east london
      permanent
      42499

	FullDescription	LocationNormalized	ContractTime	SalaryNormalized
0	International Sales Manager London **k **...	London	permanent	33000
1	An ideal opportunity for an individual that ha...	London	permanent	50000
2	Online Content and Brand Manager// Luxury Reta...	South East London	permanent	40000
3	A great local marketleader is seeking a perman...	Dereham	permanent	22500
4	Registered Nurse / RGN Nursing Home for Young...	Sutton Coldfield	NaN	20355
5	Sales and Marketing Assistant will provide adm...	Crawley	NaN	22500
6	Vacancy Ladieswear fashion Area Manager / Regi...	UK	permanent	32000
7	Reference: LR/JAN/**** Our client is one of th...	Bristol	permanent	30000
8	Sponsorship Manager London The Company A marke...	Central London	permanent	31500
9	About Barclays Barclays moves, lends, invests ...	South East London	permanent	42499

	FullDescription	LocationNormalized	ContractTime	SalaryNormalized
0	international sales manager london k ...	london	permanent	33000
1	an ideal opportunity for an individual that ha...	london	permanent	50000
2	online content and brand manager luxury reta...	south east london	permanent	40000
3	a great local marketleader is seeking a perman...	dereham	permanent	22500
4	registered nurse rgn nursing home for young...	sutton coldfield	nan	20355
5	sales and marketing assistant will provide adm...	crawley	nan	22500
6	vacancy ladieswear fashion area manager regi...	uk	permanent	32000
7	reference lr jan our client is one of th...	bristol	permanent	30000
8	sponsorship manager london the company a marke...	central london	permanent	31500
9	about barclays barclays moves lends invests ...	south east london	permanent	42499

	FullDescription	LocationNormalized	ContractTime	SalaryNormalized
0	We currently have a vacancy for an HR Project ...	Milton Keynes	contract	NaN
1	A Web developer opportunity has arisen with an...	Manchester	permanent	NaN