In [1]:
%pylab inline
import pandas as pd
import sklearn.ensemble as sk_ensemble
import sklearn.cross_validation as sk_cv
import xgboost as xgb
In [2]:
def rmsle(actual, predicted):
    """Root mean squared logarithmic error between targets and predictions.

    Parameters
    ----------
    actual : array-like of numbers
        Observed values.
    predicted : array-like of numbers
        Model outputs, in the same order as ``actual``.

    Returns
    -------
    float
        sqrt(mean((log1p(predicted) - log1p(actual)) ** 2)).
    """
    # Strip any pandas index first: subtracting two Series aligns them on
    # index labels, which silently yields NaN when the labels differ (e.g. a
    # filtered frame's column vs. a freshly built Series).  Comparing plain
    # arrays keeps the comparison positional and also lets plain lists work.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    error = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(error)))
In [86]:
# Load the preprocessed training table from the directory above the notebook.
# The id split, CV and model-fitting cells below all derive from `train`.
train = pd.read_csv('../preprocessed_train.csv')
In [87]:
# Preview the first 10 rows of the training frame.
train.head(10)
Out[87]:
In [10]:
# Distinct tube_assembly_id values, separated by the bracket_pricing_Yes flag
# (flag == 1 vs. everything else).
bracket_pricing_ta_ids = train.loc[train['bracket_pricing_Yes'] == 1, 'tube_assembly_id'].unique()
non_bracket_pricing_ta_ids = train.loc[train['bracket_pricing_Yes'] != 1, 'tube_assembly_id'].unique()
In [11]:
# Number of distinct assembly ids in each pricing group.
# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3.
print(bracket_pricing_ta_ids.shape)
print(non_bracket_pricing_ta_ids.shape)
In [88]:
# Row/column counts of the training frame for each pricing group.
# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3.
print(train[train.bracket_pricing_Yes == 1].shape)
print(train[train.bracket_pricing_Yes != 1].shape)
In [13]:
# 70/30 split of the bracket-priced assembly ids (not rows), seeded so the
# partition is repeatable.
bp_ids_train_cv, bp_ids_test = sk_cv.train_test_split(
    bracket_pricing_ta_ids,
    train_size=0.7,
    random_state=0
)
In [23]:
# Sanity-check the sizes and container type of the bracket-priced id split.
# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3.
print(bp_ids_train_cv.shape)
print(bp_ids_test.shape)
print(type(bp_ids_train_cv))
In [14]:
# 70/30 split of the non-bracket-priced assembly ids, with the same seed as
# the bracket-priced split.
nbp_ids_train_cv, nbp_ids_test = sk_cv.train_test_split(
    non_bracket_pricing_ta_ids,
    train_size=0.7,
    random_state=0
)
In [26]:
# Materialise the row-level train/CV and holdout frames from the id-level
# splits, so all rows of one tube assembly stay on the same side.
# `.loc` with a boolean mask selects the same rows as the deprecated `.ix`.
train_cv = train.loc[train.tube_assembly_id.isin(bp_ids_train_cv.tolist() + nbp_ids_train_cv.tolist())]
holdout = train.loc[train.tube_assembly_id.isin(bp_ids_test.tolist() + nbp_ids_test.tolist())]
In [27]:
# Row/column counts of the train/CV and holdout frames.
# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3.
print(train_cv.shape)
print(holdout.shape)
In [70]:
# Seeded, shuffled 10-fold index generators, one per pricing group.  The
# folds index into the *id* arrays, not into the rows of `train`.
kfold_bp = sk_cv.KFold(
    len(bracket_pricing_ta_ids), 10, shuffle=True, random_state=0
)
kfold_nbp = sk_cv.KFold(
    len(non_bracket_pricing_ta_ids), 10, shuffle=True, random_state=0
)
In [95]:
# print kfold_bp
# print kfold_nbp
In [96]:
# 10-fold CV over the bracket-priced assemblies.  Folds are drawn over
# assembly ids and then expanded to rows, so all rows of one assembly fall on
# the same side of each split.  One RMSLE value is printed per fold.
for train_kf, test_kf in kfold_bp:
    ta_ids_in_train = bracket_pricing_ta_ids[train_kf].tolist()
    ta_ids_in_cv = bracket_pricing_ta_ids[test_kf].tolist()

    # `.loc` with a boolean mask selects the same rows as the deprecated `.ix`.
    train_df = train.loc[train.tube_assembly_id.isin(ta_ids_in_train)]
    cv_df = train.loc[train.tube_assembly_id.isin(ta_ids_in_cv)]

    X = train_df.drop(['tube_assembly_id', 'cost'], axis=1).values
    y = train_df.cost

    # Fit on log1p(cost); predictions are mapped back with expm1 below.
    rf = sk_ensemble.RandomForestRegressor(n_estimators=50, n_jobs=4, oob_score=True)
    rf = rf.fit(X, np.log1p(y))

    # Fold-local names: the original reused `holdout_X` / `holdout_y` here,
    # overwriting the real holdout variables with per-fold data.
    cv_X = cv_df.drop(['tube_assembly_id', 'cost'], axis=1).values
    cv_y = cv_df.cost
    cv_y_fitted = np.expm1(rf.predict(cv_X))
    print(rmsle(cv_y, cv_y_fitted))
In [100]:
# 10-fold CV over the non-bracket-priced assemblies.  Folds are drawn over
# assembly ids and then expanded to rows, so all rows of one assembly fall on
# the same side of each split.  Prints in-sample and out-of-fold RMSLE per fold.
for train_kf, test_kf in kfold_nbp:
    ta_ids_in_train = non_bracket_pricing_ta_ids[train_kf].tolist()
    ta_ids_in_cv = non_bracket_pricing_ta_ids[test_kf].tolist()

    # `.loc` with a boolean mask selects the same rows as the deprecated `.ix`.
    train_df = train.loc[train.tube_assembly_id.isin(ta_ids_in_train)]
    cv_df = train.loc[train.tube_assembly_id.isin(ta_ids_in_cv)]

    X = train_df.drop(['tube_assembly_id', 'cost'], axis=1).values
    y = train_df.cost

    # Fit on log1p(cost); predictions are mapped back with expm1 below.
    rf = sk_ensemble.RandomForestRegressor(n_estimators=250, n_jobs=4, oob_score=True)
    rf = rf.fit(X, np.log1p(y))

    y_fitted = np.expm1(rf.predict(X))
    # '%f.2' printed six decimals followed by a literal ".2"; '%.2f' is the
    # evident intent.  Arguments are passed as (actual, predicted); the metric
    # is symmetric, so the value is unchanged.
    print('in sample: %.2f' % rmsle(y, y_fitted))

    # Fold-local names: the original reused `holdout_X` / `holdout_y` here,
    # overwriting the real holdout variables with per-fold data.
    cv_X = cv_df.drop(['tube_assembly_id', 'cost'], axis=1).values
    cv_y = cv_df.cost
    cv_y_fitted = np.expm1(rf.predict(cv_X))
    print('cv: %.2f' % rmsle(cv_y, cv_y_fitted))
In [68]:
# train_sub_train, train_sub_cv = sk_cv.train_test_split(train.ix[:29000], train_size = 0.5, random_state = 346)
# train_sub_test = train.ix[29000:]
In [54]:
# Fit the random forest on the full train/CV frame.  The id column and the
# target are removed from the feature matrix; the target is modelled as
# log1p(cost).
X = train_cv.drop(['tube_assembly_id', 'cost'], axis=1).values
y = train_cv.cost
# NOTE(review): no random_state is passed to the forest (a seeded variant
# ", random_state=0, verbose=0" was left commented out here), so refitting
# will not reproduce the same model.
rf = sk_ensemble.RandomForestRegressor(n_estimators=250, n_jobs=4, oob_score=True)
rf = rf.fit(X, np.log1p(y))
In [57]:
# In-sample RMSLE: predict on the training matrix and undo the log1p target
# transform with expm1 before scoring.
# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3.
y_fitted = np.expm1(rf.predict(X))
print(rmsle(y, y_fitted))
In [55]:
# Score the held-out assemblies: same feature columns as the training matrix,
# predictions mapped back from log space with expm1.
holdout_y = holdout['cost']
holdout_X = holdout.drop(['tube_assembly_id', 'cost'], axis=1).values
holdout_y_fitted = np.expm1(
    rf.predict(holdout_X)
)
In [56]:
# Holdout RMSLE.  Single-argument print(...) produces the same output on
# Python 2 and also parses on Python 3.
print(rmsle(holdout_y, holdout_y_fitted))
In [38]:
# Pair each feature column name with the fitted forest's importance score.
# The column list is rebuilt with the same drop() used for the training
# matrix so the names line up with rf.feature_importances_.
feats_imp = pd.DataFrame({
    'feats': train_cv.drop(['tube_assembly_id', 'cost'], axis=1).columns,
    'imp': rf.feature_importances_
})
In [94]:
# Bar chart of the 50 most important features.  x='feats' labels each bar
# with the feature name; without it the bars are labelled by the frame's
# integer row index, which says nothing about which feature is which.
# NOTE(review): DataFrame.sort(columns=...) is the legacy pandas spelling;
# on newer pandas releases this call is sort_values(by='imp', ascending=False).
feats_imp.sort(columns=['imp'], ascending=False).head(50).plot(kind='bar', x='feats', y='imp')
Out[94]:
In [82]:
# NOTE(review): `train_sub_cv` and `train_sub_test` are only assigned in a
# commented-out cell of this notebook, so this cell fails on a fresh kernel
# unless they are defined somewhere not visible here — confirm or remove.
# NOTE(review): 'quote_date' is dropped here but is not dropped when the
# matrix for the `rf` fit on `train_cv` is built; confirm the column count
# matches what `rf` was trained on.
X_cv = train_sub_cv.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y_cv = train_sub_cv.cost
# X_test / y_test are built here but not used in any visible cell.
X_test = train_sub_test.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y_test = train_sub_test.cost
# Predictions are mapped back from the log1p target space with expm1.
y_cv_fitted = np.expm1(rf.predict(X_cv))
In [83]:
# RMSLE on the CV subset, followed by the first ten actual and predicted
# costs for a side-by-side eyeball check.
# Single-argument print(...) produces the same output on Python 2 and also
# parses on Python 3.
print(rmsle(y_cv, y_cv_fitted))
print(y_cv[:10])
print(y_cv_fitted[:10])
In [73]:
# NOTE(review): `kaggle_test_df` is first assigned in a later cell of this
# notebook, so this preview only works after out-of-order execution —
# consider moving it below the cell that builds the frame.
kaggle_test_df.head()
Out[73]:
In [84]:
# Build the Kaggle test feature matrix and predict costs with the fitted forest.
kaggle_test = pd.read_csv('../test_dummies_adjusted.csv')
# NOTE(review): `tube`, `spec`, `comp_type_weight` and `tube_vol` are not
# defined in the visible part of this notebook — confirm where they are loaded.
# NOTE(review): no `on=` / `how=` is given, so pandas' merge defaults apply
# (inner join on the shared column names); unmatched test rows would be
# dropped silently — verify the row count still equals len(kaggle_test).
kaggle_test_df = pd.merge(kaggle_test, tube)
kaggle_test_df = pd.merge(kaggle_test_df, spec)
kaggle_test_df = pd.merge(kaggle_test_df, comp_type_weight)
kaggle_test_df = pd.merge(kaggle_test_df, tube_vol)
# NOTE(review): this drops 'quote_date', 'cost' and 'id' in addition to the
# assembly id, while the `rf` fit on `train_cv` drops only the assembly id and
# 'cost' — confirm the remaining columns match the training matrix in number
# and order.
kaggle_test_vals = kaggle_test_df.drop(['tube_assembly_id', 'quote_date', 'cost', 'id'], axis=1).values
# The model predicts log1p(cost); expm1 maps predictions back to cost.
preds = np.expm1(rf.predict(kaggle_test_vals))
In [85]:
# First ten predicted costs (already mapped back from log space via expm1).
preds[:10]
Out[85]:
In [ ]: