In [2]:
%pylab inline
import pandas as pd
import sklearn.ensemble as sk_ensemble
import sklearn.cross_validation as sk_cv
import xgboost as xgb


Populating the interactive namespace from numpy and matplotlib

In [3]:
def rmsle(actual, predicted):
    """Root Mean Squared Logarithmic Error between two value arrays.

    Uses log1p on both sides so zero values are safe; lower is better,
    0.0 means a perfect fit. This is the competition's scoring metric.
    """
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt((log_diff ** 2).mean())

In [4]:
train_df = pd.read_csv('../train_set_adjusted.csv')

In [45]:
# print train_df.shape
# print train_df.columns

In [6]:
tube = pd.read_csv('../tube_material_id_imputed_dummies_drop_ns.csv')

In [46]:
# print tube.shape
# print tube.columns

In [8]:
spec = pd.read_csv('../spec_dummies.csv')

In [47]:
# print spec.shape
# print spec.columns

In [10]:
comp = pd.read_csv('../comp_type_dummies.csv')

In [48]:
# print comp.shape
# print comp.columns

In [12]:
comp_type_weight = pd.read_csv('../comp_type_weight.csv')

In [49]:
# print comp_type_weight.shape
# print comp_type_weight.columns

In [14]:
tube_vol = pd.read_csv('../tube_volume.csv')

In [15]:
# Join all feature tables onto the quotes. pd.merge defaults to an INNER join
# on the shared columns (presumably tube_assembly_id), so any assembly missing
# from a side table silently drops its quote rows -- verify the row count
# after each merge (Out[30] shows 30213 rows survive).
train = pd.merge(train_df, tube)
train = pd.merge(train, spec)
train = pd.merge(train, comp_type_weight)
train = pd.merge(train, tube_vol)

In [30]:
train.shape


Out[30]:
(30213, 264)

In [68]:
# Hold out the tail of the frame as a fixed test chunk, then split the head
# 50/50 into train and cv folds.
# Fixes from review:
#  * .loc replaces the deprecated/ambiguous .ix; on the default integer index
#    both slice by label, so .loc[:29000] is inclusive of row 29000 and the
#    fold sizes (14500 + 14501 = 29001) are unchanged.
#  * The original used .ix[29000:] for the test chunk, so label 29000 appeared
#    in BOTH the split pool and the test set; starting at 29001 removes the
#    one-row overlap.
# NOTE(review): rows are not grouped by tube_assembly_id, so the same assembly
# can appear on both sides of a split -- possible leakage; confirm intended.
train_sub_train, train_sub_cv = sk_cv.train_test_split(train.loc[:29000], train_size = 0.5, random_state = 346)
train_sub_test = train.loc[29001:]

In [69]:
# Confirm the 50/50 split of the head rows (Python 2 print statements).
print train_sub_train.shape
print train_sub_cv.shape


(14500, 264)
(14501, 264)

In [81]:
# Design matrix: drop the identifier, date, and target columns; everything
# left is numeric/dummy-encoded.
X = train_sub_train.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
Y = train_sub_train.cost
# Fit the forest on log1p(cost) so the squared-error split criterion lines up
# with the RMSLE evaluation metric; predictions are inverted with expm1 later.
# random_state is pinned (same seed as the train/cv split) so the model is
# reproducible across re-runs -- the original left it unset, making every
# reported score non-reproducible.
rf = sk_ensemble.RandomForestRegressor(n_estimators=500, n_jobs=4, oob_score=True, random_state=346)
rf = rf.fit(X, np.log1p(Y))

In [82]:
# Build the cv and test design matrices with the same column drops as training.
X_cv = train_sub_cv.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y_cv = train_sub_cv.cost
X_test = train_sub_test.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y_test = train_sub_test.cost
# The model was fit on log1p(cost), so invert with expm1 for dollar-scale preds.
y_cv_fitted = np.expm1(rf.predict(X_cv))

In [83]:
# Hold-out RMSLE plus a spot check of the first ten actual vs. fitted costs
# (Out[83] below shows ~0.248).
print rmsle(y_cv, y_cv_fitted)
print y_cv[:10]
print y_cv_fitted[:10]


0.248358061663
8768      4.657167
17279    21.115828
11098     7.578329
13415    13.033947
26622    11.843225
23383     8.696717
25047     6.754839
28684     1.986390
26369    21.892023
22634    18.687091
Name: cost, dtype: float64
[  4.78016727  22.18499406   7.70401672  13.63151068  11.24795093
  10.80556044   6.64042232   2.15777853  21.26286314  15.32715093]

In [73]:
kaggle_test_df.head()


Out[73]:
annual_usage cost min_order_quantity quantity quantity_rep quote_date tube_assembly_id year supplier_S-0003 supplier_S-0004 ... CP-022 CP-023 CP-024 CP-025 CP-026 CP-027 CP-028 CP-029 OTHER tube_volume
0 0 0 0 1 1.00 2013-06-23 TA-00001 2013 0 0 ... 0 0 0.024 0.072 0 0 0 0 0 81697.336868
1 0 0 0 2 0.50 2013-06-23 TA-00001 2013 0 0 ... 0 0 0.024 0.072 0 0 0 0 0 81697.336868
2 0 0 0 5 0.20 2013-06-23 TA-00001 2013 0 0 ... 0 0 0.024 0.072 0 0 0 0 0 81697.336868
3 0 0 0 10 0.10 2013-06-23 TA-00001 2013 0 0 ... 0 0 0.024 0.072 0 0 0 0 0 81697.336868
4 0 0 0 25 0.04 2013-06-23 TA-00001 2013 0 0 ... 0 0 0.024 0.072 0 0 0 0 0 81697.336868

5 rows × 265 columns


In [84]:
# Rebuild the Kaggle test set with the same feature joins as training.
kaggle_test = pd.read_csv('../test_dummies_adjusted.csv')
kaggle_test_df = pd.merge(kaggle_test, tube)
kaggle_test_df = pd.merge(kaggle_test_df, spec)
kaggle_test_df = pd.merge(kaggle_test_df, comp_type_weight)
kaggle_test_df = pd.merge(kaggle_test_df, tube_vol)
# NOTE(review): the default inner joins can drop test rows whose assembly is
# missing from a side table, and merging can reorder rows -- any submission
# built from `preds` must be aligned with kaggle_test_df['id'], and the id
# count should be checked against the raw test file. Confirm no ids are lost.
kaggle_test_vals = kaggle_test_df.drop(['tube_assembly_id', 'quote_date', 'cost', 'id'], axis=1).values
preds = np.expm1(rf.predict(kaggle_test_vals))

In [85]:
preds[:10]


Out[85]:
array([ 28.62403488,  24.8836941 ,  20.91606194,  10.27019523,
         8.71596676,   8.64892847,   8.61588627,   8.59260788,
        28.59496226,  20.44210586])

In [ ]: