In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge
from scipy.sparse import hstack
In [2]:
# Load the training set from CSV into a DataFrame.
train_data = pd.read_csv('resources/salary-train.csv')
In [3]:
# Preview the first 10 rows of the freshly loaded training data.
train_data.head(10)
Out[3]:
In [4]:
# Normalise the text columns: lower-case them, then replace every character
# that is not an ASCII letter or digit with a space. Missing values remain
# NaN through both steps.
for column in ('FullDescription', 'LocationNormalized', 'ContractTime'):
    train_data[column] = (
        train_data[column]
        .str.lower()
        .replace('[^a-zA-Z0-9]', ' ', regex=True)
    )

# Give missing categorical values an explicit 'nan' label so they become a
# category of their own. The result is assigned back to the frame rather than
# calling fillna(inplace=True) on the column selection: that form is a chained
# assignment, which warns on recent pandas and leaves train_data unchanged
# when Copy-on-Write is enabled.
for column in ('LocationNormalized', 'ContractTime'):
    train_data[column] = train_data[column].fillna('nan')
In [5]:
# Preview the first 10 rows again to inspect the normalised columns.
train_data.head(10)
Out[5]:
In [6]:
# TF-IDF encoder for the job descriptions; min_df=5 ignores terms that occur
# in fewer than 5 documents.
vectorizer = TfidfVectorizer(min_df=5)
In [7]:
# Fit the TF-IDF vocabulary on the training descriptions and encode them in
# the same call.
description_vectorized = vectorizer.fit_transform(train_data['FullDescription'])
In [8]:
# Encode the two categorical columns. Each training row is first turned into a
# {column name: value} dict; DictVectorizer then learns the feature set from
# those records and encodes them.
categorical_records = train_data[['LocationNormalized', 'ContractTime']].to_dict('records')
dict_vectorizer = DictVectorizer()
location_and_contract_time_vectorized = dict_vectorizer.fit_transform(categorical_records)
In [9]:
# Training design matrix: text features and categorical features stacked
# side by side (column-wise) as one sparse matrix.
X = hstack([description_vectorized, location_and_contract_time_vectorized])
In [10]:
# Ridge regression model with regularisation strength alpha=1 and a pinned
# random_state.
ridge = Ridge(alpha=1, random_state=241)
In [11]:
# Fit the model on the combined features with SalaryNormalized as the target.
ridge.fit(X, train_data['SalaryNormalized'])
Out[11]:
In [12]:
# Load the test set from CSV into a DataFrame.
test_data = pd.read_csv('resources/salary-test-mini.csv')
In [13]:
# Preview the first 10 rows of the test data.
test_data.head(10)
Out[13]:
In [14]:
# Normalise the test columns the same way as the training columns: lower-case,
# then replace every character that is not an ASCII letter or digit with a
# space. test_data itself is left untouched; the results go into new Series.
test_description = (
    test_data['FullDescription']
    .str.lower()
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
)

# The categorical columns additionally get missing values labelled 'nan',
# matching the label the training columns are filled with, so a missing value
# in the test set maps onto the same category the model was trained on.
test_location = (
    test_data['LocationNormalized']
    .str.lower()
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
    .fillna('nan')
)
# The variable name (sic: "contact") is kept because the vectorization cell
# reads it under this name.
test_contact_time = (
    test_data['ContractTime']
    .str.lower()
    .replace('[^a-zA-Z0-9]', ' ', regex=True)
    .fillna('nan')
)
In [15]:
# Encode the test descriptions with the vocabulary fitted on the training set.
test_description_vectorized = vectorizer.transform(test_description)

# DictVectorizer expects one {column name: value} mapping per sample, using the
# same keys it was fitted with. Passing the two Series directly would treat
# each Series as a single sample keyed by row index, so none of the fitted
# features would match and the categorical block would come out all zeros.
# Rebuild per-row records under the training column names instead, labelling
# missing values 'nan' as the training data does.
test_categorical = pd.DataFrame({
    'LocationNormalized': test_location,
    'ContractTime': test_contact_time,
}).fillna('nan')
test_location_and_contract_time_vectorized = dict_vectorizer.transform(
    test_categorical.to_dict('records')
)
In [16]:
# Test design matrix, assembled in the same column order as X: text features
# first, then the categorical features. Despite the name, Y holds features,
# not targets.
Y = hstack([test_description_vectorized, test_location_and_contract_time_vectorized])
In [17]:
# Predict with the fitted model for every row of the test design matrix; the
# result is shown as the cell output.
ridge.predict(Y)
Out[17]:
In [18]:
# Show the first 10 rows of the (preprocessed) training data once more.
train_data.head(10)
Out[18]: