In [2]:
%pylab inline
import pandas as pd
import sklearn.ensemble as sk_ensemble
import sklearn.cross_validation as sk_cv
import xgboost as xgb


Populating the interactive namespace from numpy and matplotlib

In [3]:
def rmsle(actual, predicted):
    """Root Mean Squared Logarithmic Error between two value arrays.

    Uses log1p on both sides so zero values are safe; lower is better,
    0.0 means a perfect fit. This is the competition's scoring metric.
    """
    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt((log_diff ** 2).mean())

In [4]:
train_df = pd.read_csv('../train_set_adjusted.csv')

In [45]:
# print train_df.shape
# print train_df.columns

In [6]:
tube = pd.read_csv('../tube_material_id_imputed_dummies_drop_ns.csv')

In [46]:
# print tube.shape
# print tube.columns

In [8]:
spec = pd.read_csv('../spec_dummies.csv')

In [47]:
# print spec.shape
# print spec.columns

In [10]:
comp = pd.read_csv('../comp_type_dummies.csv')

In [48]:
# print comp.shape
# print comp.columns

In [12]:
comp_type_weight = pd.read_csv('../comp_type_weight.csv')

In [49]:
# print comp_type_weight.shape
# print comp_type_weight.columns

In [14]:
tube_vol = pd.read_csv('../tube_volume.csv')

In [15]:
# Join all feature tables onto the quotes. pd.merge defaults to an INNER join
# on the shared columns (presumably tube_assembly_id), so any assembly missing
# from a side table silently drops its quote rows -- verify the row count
# after each merge (Out[30] shows 30213 rows survive).
train = pd.merge(train_df, tube)
train = pd.merge(train, spec)
train = pd.merge(train, comp_type_weight)
train = pd.merge(train, tube_vol)

In [30]:
train.shape


Out[30]:
(30213, 264)

In [68]:
# Hold out the tail of the frame as a fixed test chunk, then split the head
# 50/50 into train and cv folds.
# Fixes from review:
#  * .loc replaces the deprecated/ambiguous .ix; on the default integer index
#    both slice by label, so .loc[:29000] is inclusive of row 29000 and the
#    fold sizes (14500 + 14501 = 29001) are unchanged.
#  * The original used .ix[29000:] for the test chunk, so label 29000 appeared
#    in BOTH the split pool and the test set; starting at 29001 removes the
#    one-row overlap.
# NOTE(review): rows are not grouped by tube_assembly_id, so the same assembly
# can appear on both sides of a split -- possible leakage; confirm intended.
train_sub_train, train_sub_cv = sk_cv.train_test_split(train.loc[:29000], train_size = 0.5, random_state = 346)
train_sub_test = train.loc[29001:]

In [69]:
# Confirm the 50/50 split of the head rows (Python 2 print statements).
print train_sub_train.shape
print train_sub_cv.shape


(14500, 264)
(14501, 264)

In [81]:
# Design matrix: drop the identifier, date, and target columns; everything
# left is numeric/dummy-encoded.
X = train_sub_train.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
Y = train_sub_train.cost
# Fit the forest on log1p(cost) so the squared-error split criterion lines up
# with the RMSLE evaluation metric; predictions are inverted with expm1 later.
# random_state is pinned (same seed as the train/cv split) so the model is
# reproducible across re-runs -- the original left it unset, making every
# reported score non-reproducible.
rf = sk_ensemble.RandomForestRegressor(n_estimators=500, n_jobs=4, oob_score=True, random_state=346)
rf = rf.fit(X, np.log1p(Y))

In [82]:
# Build the cv and test design matrices with the same column drops as training.
X_cv = train_sub_cv.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y_cv = train_sub_cv.cost
X_test = train_sub_test.drop(['tube_assembly_id', 'quote_date', 'cost'], axis=1).values
y_test = train_sub_test.cost
# The model was fit on log1p(cost), so invert with expm1 for dollar-scale preds.
y_cv_fitted = np.expm1(rf.predict(X_cv))

In [83]:
# Hold-out RMSLE plus a spot check of the first ten actual vs. fitted costs
# (Out[83] below shows ~0.248).
print rmsle(y_cv, y_cv_fitted)
print y_cv[:10]
print y_cv_fitted[:10]


0.248358061663
8768      4.657167
17279    21.115828
11098     7.578329
13415    13.033947
26622    11.843225
23383     8.696717
25047     6.754839
28684     1.986390
26369    21.892023
22634    18.687091
Name: cost, dtype: float64
[  4.78016727  22.18499406   7.70401672  13.63151068  11.24795093
  10.80556044   6.64042232   2.15777853  21.26286314  15.32715093]

In [73]:
kaggle_test_df.head()


Out[73]:
annual_usage cost min_order_quantity quantity quantity_rep quote_date tube_assembly_id year supplier_S-0003 supplier_S-0004 ... CP-022 CP-023 CP-024 CP-025 CP-026 CP-027 CP-028 CP-029 OTHER tube_volume
0 0 0 0 1 1.00 2013-06-23 TA-00001 2013 0 0 ... 0 0 0.024 0.072 0 0 0 0 0 81697.336868
1 0 0 0 2 0.50 2013-06-23 TA-00001 2013 0 0 ... 0 0 0.024 0.072 0 0 0 0 0 81697.336868
2 0 0 0 5 0.20 2013-06-23 TA-00001 2013 0 0 ... 0 0 0.024 0.072 0 0 0 0 0 81697.336868
3 0 0 0 10 0.10 2013-06-23 TA-00001 2013 0 0 ... 0 0 0.024 0.072 0 0 0 0 0 81697.336868
4 0 0 0 25 0.04 2013-06-23 TA-00001 2013 0 0 ... 0 0 0.024 0.072 0 0 0 0 0 81697.336868

5 rows × 265 columns


In [84]:
# Rebuild the Kaggle test set with the same feature joins as training.
kaggle_test = pd.read_csv('../test_dummies_adjusted.csv')
kaggle_test_df = pd.merge(kaggle_test, tube)
kaggle_test_df = pd.merge(kaggle_test_df, spec)
kaggle_test_df = pd.merge(kaggle_test_df, comp_type_weight)
kaggle_test_df = pd.merge(kaggle_test_df, tube_vol)
# NOTE(review): the default inner joins can drop test rows whose assembly is
# missing from a side table, and merging can reorder rows -- any submission
# built from `preds` must be aligned with kaggle_test_df['id'], and the id
# count should be checked against the raw test file. Confirm no ids are lost.
kaggle_test_vals = kaggle_test_df.drop(['tube_assembly_id', 'quote_date', 'cost', 'id'], axis=1).values
preds = np.expm1(rf.predict(kaggle_test_vals))

In [85]:
preds[:10]


Out[85]:
array([ 28.62403488,  24.8836941 ,  20.91606194,  10.27019523,
         8.71596676,   8.64892847,   8.61588627,   8.59260788,
        28.59496226,  20.44210586])

In [ ]: