In [1]:
# NOTE(review): %pylab is deprecated — it star-imports numpy/pylab names into
# the namespace (bare log1p/expm1/plt used in later cells rely on this).
# Prefer %matplotlib inline plus explicit `import numpy as np` and
# `import matplotlib.pyplot as plt` when migrating.
%pylab inline
import pandas as pd
In [26]:
def rmsle(actual, predicted):
    """Root Mean Squared Logarithmic Error.

    Both arguments are array-likes of non-negative values; returns a float.
    RMSLE = sqrt(mean((log1p(predicted) - log1p(actual))^2)).
    """
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(log_diff ** 2))
In [9]:
# Training quotes with year/month features added upstream (provenance: file
# produced by an earlier preprocessing step — path is relative to notebook dir).
train_set_path = '../train_set_ym.csv'
train_set = pd.read_csv(train_set_path)
In [10]:
# Inspect column dtypes (rich display as the cell's last expression).
train_set.dtypes
Out[10]:
In [305]:
# Restrict to quotes priced without bracket pricing.
model_data = train_set.loc[train_set['bracket_pricing'] == 'No']
In [306]:
# bracket_pricing is now constant ('No' everywhere), so drop the column.
model_data = model_data.drop('bracket_pricing', axis=1)
In [289]:
# One-hot encode the supplier column.
# NOTE(review): this cell's execution count [289] predates the re-run of the
# cells above ([305]/[306]), so it appears to have been skipped on the final
# pass — confirm whether supplier dummies belong in the final model input.
model_data = pd.get_dummies(model_data, columns=['supplier'])
In [307]:
# Tube features (material_id imputed, dummied, some columns dropped upstream).
# NOTE(review): loaded but never used — the tube merge below is commented out.
tube = pd.read_csv('../tube_material_id_imputed_dummies_drop_ns.csv')
In [308]:
# Per-assembly component-weight features produced upstream.
comp_weight_path = '../comp_weight.csv'
comp_weight = pd.read_csv(comp_weight_path)
In [309]:
# model_data = pd.merge(model_data, tube, how='left', on='tube_assembly_id')
# Attach component-weight features; left join keeps every quote row even when
# an assembly has no weight record.
model_data = model_data.merge(comp_weight, how='left', on='tube_assembly_id')
In [310]:
# print as a function call: same output for a single argument under Python 2
# and required syntax under Python 3.
print(model_data.shape)
model_data.head()
Out[310]:
In [224]:
import sklearn.cross_validation as skcv
import sklearn.ensemble as sken
import sklearn.grid_search as skgs
# NOTE(review): sklearn.cross_validation and sklearn.grid_search were merged
# into sklearn.model_selection in scikit-learn 0.18 and later removed —
# migrate these imports when upgrading scikit-learn.
In [311]:
# 70/30 train/test split; fixed random_state keeps the split reproducible.
model_train, model_test = skcv.train_test_split(model_data, train_size=0.7, random_state=5)
In [283]:
# skcv.KFold()
In [312]:
# Sanity-check the split sizes (print-as-function for Python 3 compatibility).
print(model_train.shape)
print(model_test.shape)
In [295]:
# Random forest regressor on log1p(cost); max_features=5 limits the features
# considered per split. random_state=0 added for reproducibility — the
# original cell had no seed, so results changed on every re-run; the value
# matches the grid-search cell later in the notebook.
rf = sken.RandomForestRegressor(n_estimators=100, max_features=5, random_state=0)
# Alternatives tried earlier:
# rf = sken.ExtraTreesRegressor(n_estimators=50)
# rf = sken.GradientBoostingRegressor(learning_rate=0.25, n_estimators=200)
In [313]:
# Feature matrix: everything except the assembly id and the target.
non_feature_cols = ['tube_assembly_id', 'cost']
model_train_X = model_train.drop(non_feature_cols, axis=1)
# Fit against log1p(cost) so the squared-error objective lines up with RMSLE.
rf = rf.fit(model_train_X, np.log1p(model_train.cost))
In [297]:
# Back-transform predictions from log space, then score on the training set
# (print-as-function for Python 3 compatibility).
model_train_pred = np.expm1(rf.predict(model_train_X))
print(rmsle(model_train.cost, model_train_pred))
In [298]:
# Actual vs. fitted values in log space for plotting. Use np.log1p explicitly
# (as the fit cell above does) instead of the bare log1p injected by %pylab.
model_train_actual_fit = pd.DataFrame({'actual' : np.log1p(model_train.cost), 'fit' : np.log1p(model_train_pred)})
In [299]:
# Pair each training feature with its random-forest importance score.
model_feature_importance = pd.DataFrame(
    {'feature' : model_train_X.columns,
     'score' : rf.feature_importances_})
In [304]:
# Display the full importance table (rich display as last expression).
model_feature_importance
Out[304]:
In [300]:
# Top 12 features by importance. DataFrame.sort() was deprecated in pandas
# 0.17 and removed in 0.20 — sort_values is the supported equivalent.
important_feats = model_feature_importance.sort_values(by='score', ascending=False).head(12).feature.values
In [303]:
# Display the selected feature names.
important_feats
Out[303]:
In [301]:
# Fitted vs. actual (log space) on the training set; points on the diagonal
# indicate a perfect fit. Title added so the figure stands alone.
ax = model_train_actual_fit.plot(x='actual', y='fit', kind='scatter')
ax.set_title('Train: fitted vs. actual log1p(cost)')
plt.show()
In [302]:
# Held-out evaluation: same feature prep as training, RMSLE on original scale
# (print-as-function for Python 3 compatibility).
model_test_X = model_test.drop(['tube_assembly_id', 'cost'], axis=1)
model_test_pred = np.expm1(rf.predict(model_test_X))
print(rmsle(model_test.cost, model_test_pred))
In [268]:
# Actual vs. fitted (log space) on the held-out set. Use np.log1p explicitly
# rather than the bare log1p injected by %pylab.
model_test_actual_fit = pd.DataFrame({'actual' : np.log1p(model_test.cost), 'fit' : np.log1p(model_test_pred)})
In [269]:
# Fitted vs. actual (log space) on the held-out set; title added so the
# figure stands alone.
ax = model_test_actual_fit.plot(x='actual', y='fit', kind='scatter')
ax.set_title('Test: fitted vs. actual log1p(cost)')
plt.show()
In [270]:
# Refit using only the top features selected above.
# NOTE(review): this rebinds `rf`, discarding the all-features fit — later
# cells that reuse `rf` get the reduced-feature model.
model_train_X_imps = model_train.loc[:, important_feats]
rf = rf.fit(model_train_X_imps, np.log1p(model_train.cost))
In [271]:
# Training RMSLE for the reduced-feature model (print-as-function for
# Python 3 compatibility).
model_train_imps_pred = np.expm1(rf.predict(model_train_X_imps))
print(rmsle(model_train.cost, model_train_imps_pred))
In [272]:
# Held-out RMSLE for the reduced-feature model (print-as-function for
# Python 3 compatibility).
model_test_X = model_test.drop(['tube_assembly_id', 'cost'], axis=1)
model_test_X_imps = model_test_X[important_feats]
model_test_pred = np.expm1(rf.predict(model_test_X_imps))
print(rmsle(model_test.cost, model_test_pred))
In [273]:
# Grid-search over forest size; other hyperparameters held fixed, seeded for
# reproducibility.
# NOTE(review): no `scoring` argument is passed, so GridSearchCV uses the
# estimator's default score rather than RMSLE — confirm the selection metric
# matches the reported one.
rf = sken.RandomForestRegressor(n_estimators=100, max_features=5, random_state=0)
param_grid = {'n_estimators' : [50, 100, 200, 500]}
gridcv = skgs.GridSearchCV(rf, param_grid=param_grid)
In [274]:
# Fit the grid search on the log target. Use np.log1p explicitly (as other
# cells do) rather than the bare log1p injected by %pylab.
gridcv.fit(model_train_X, np.log1p(model_train.cost))
Out[274]:
In [275]:
# Display the hyperparameters of the best estimator found by the grid search.
gridcv.best_estimator_
Out[275]:
In [276]:
# Training RMSLE for the best grid-search estimator. np.expm1 made explicit
# (was the bare name from %pylab); print-as-function for Python 3 compatibility.
model_train_pred = np.expm1(gridcv.predict(model_train_X))
print(rmsle(model_train.cost, model_train_pred))
In [277]:
# Held-out RMSLE for the best grid-search estimator. np.expm1 made explicit
# (was the bare name from %pylab); print-as-function for Python 3 compatibility.
model_test_X = model_test.drop(['tube_assembly_id', 'cost'], axis=1)
model_test_pred = np.expm1(gridcv.predict(model_test_X))
print(rmsle(model_test.cost, model_test_pred))