In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
from scipy.sparse import hstack

In [2]:
train_data = pd.read_csv('resources/salary-train.csv')

In [3]:
train_data.head(10)


Out[3]:
FullDescription LocationNormalized ContractTime SalaryNormalized
0 International Sales Manager London ****k ****... London permanent 33000
1 An ideal opportunity for an individual that ha... London permanent 50000
2 Online Content and Brand Manager// Luxury Reta... South East London permanent 40000
3 A great local marketleader is seeking a perman... Dereham permanent 22500
4 Registered Nurse / RGN Nursing Home for Young... Sutton Coldfield NaN 20355
5 Sales and Marketing Assistant will provide adm... Crawley NaN 22500
6 Vacancy Ladieswear fashion Area Manager / Regi... UK permanent 32000
7 Reference: LR/JAN/**** Our client is one of th... Bristol permanent 30000
8 Sponsorship Manager London The Company A marke... Central London permanent 31500
9 About Barclays Barclays moves, lends, invests ... South East London permanent 42499

In [4]:
train_data['FullDescription'] = train_data['FullDescription'].str.lower()
train_data['FullDescription'] = train_data['FullDescription'].replace('[^a-zA-Z0-9]', ' ', regex=True)

train_data['LocationNormalized'] = train_data['LocationNormalized'].str.lower()
train_data['LocationNormalized'] = train_data['LocationNormalized'].replace('[^a-zA-Z0-9]', ' ', regex=True)

train_data['ContractTime'] = train_data['ContractTime'].str.lower()
train_data['ContractTime'] = train_data['ContractTime'].replace('[^a-zA-Z0-9]', ' ', regex=True)

train_data['LocationNormalized'].fillna('nan', inplace=True)
train_data['ContractTime'].fillna('nan', inplace=True)

In [5]:
train_data.head(10)


Out[5]:
FullDescription LocationNormalized ContractTime SalaryNormalized
0 international sales manager london k ... london permanent 33000
1 an ideal opportunity for an individual that ha... london permanent 50000
2 online content and brand manager luxury reta... south east london permanent 40000
3 a great local marketleader is seeking a perman... dereham permanent 22500
4 registered nurse rgn nursing home for young... sutton coldfield nan 20355
5 sales and marketing assistant will provide adm... crawley nan 22500
6 vacancy ladieswear fashion area manager regi... uk permanent 32000
7 reference lr jan our client is one of th... bristol permanent 30000
8 sponsorship manager london the company a marke... central london permanent 31500
9 about barclays barclays moves lends invests ... south east london permanent 42499

In [6]:
vectorizer = TfidfVectorizer(min_df=5)

In [7]:
description_vectorized = vectorizer.fit_transform(train_data['FullDescription'])

In [8]:
dict_vectorizer = DictVectorizer()

location_and_contract_time_vectorized = dict_vectorizer.fit_transform(train_data[['LocationNormalized', 'ContractTime']].to_dict('records'))

In [9]:
X = hstack([description_vectorized, location_and_contract_time_vectorized])

In [10]:
ridge = Ridge(alpha=1, random_state=241)

In [11]:
ridge.fit(X, train_data['SalaryNormalized'])


Out[11]:
Ridge(alpha=1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=241, solver='auto', tol=0.001)

In [12]:
test_data = pd.read_csv('resources/salary-test-mini.csv')

In [13]:
test_data.head(10)


Out[13]:
FullDescription LocationNormalized ContractTime SalaryNormalized
0 We currently have a vacancy for an HR Project ... Milton Keynes contract NaN
1 A Web developer opportunity has arisen with an... Manchester permanent NaN

In [14]:
test_description = test_data['FullDescription'].str.lower()
test_description = test_description.replace('[^a-zA-Z0-9]', ' ', regex=True)
test_location = test_data['LocationNormalized'].str.lower()
test_location = test_location.replace('[^a-zA-Z0-9]', ' ', regex=True)
test_contact_time = test_data['ContractTime'].str.lower()
test_contact_time = test_contact_time.replace('[^a-zA-Z0-9]', ' ', regex=True)

In [15]:
test_description_vectorized = vectorizer.transform(test_description)
test_location_and_contract_time_vectorized = dict_vectorizer.transform([test_location, test_contact_time])

In [16]:
Y = hstack([test_description_vectorized, test_location_and_contract_time_vectorized])

In [17]:
ridge.predict(Y)


Out[17]:
array([53645.02006675, 39364.75513443])

In [18]:
train_data.head(10)


Out[18]:
FullDescription LocationNormalized ContractTime SalaryNormalized
0 international sales manager london k ... london permanent 33000
1 an ideal opportunity for an individual that ha... london permanent 50000
2 online content and brand manager luxury reta... south east london permanent 40000
3 a great local marketleader is seeking a perman... dereham permanent 22500
4 registered nurse rgn nursing home for young... sutton coldfield nan 20355
5 sales and marketing assistant will provide adm... crawley nan 22500
6 vacancy ladieswear fashion area manager regi... uk permanent 32000
7 reference lr jan our client is one of th... bristol permanent 30000
8 sponsorship manager london the company a marke... central london permanent 31500
9 about barclays barclays moves lends invests ... south east london permanent 42499