First pass at processing: read in the data following the "beat the benchmark" script, run a random forest regressor with cross-validation scoring, and create the submission file.


In [1]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing, ensemble, cross_validation, grid_search
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV

In [2]:
print os.listdir('.')
directory = os.path.join(os.getcwd(),'competition_data')
filename = 'train_set.csv'
path = os.path.join(directory,filename)
print path


['.ipynb_checkpoints', 'benchmark.csv', 'competition_data', 'Prelim.ipynb']
C:\Users\Victor\Desktop\Kaggle\Caterpillar\competition_data\train_set.csv

In [3]:
train = pd.read_csv(os.path.join(directory,'train_set.csv'), parse_dates=[2,])
test = pd.read_csv(os.path.join(directory,'test_set.csv'), parse_dates=[3,])

In [4]:
tubes = pd.read_csv(os.path.join(directory,'tube.csv'))

train = pd.merge(train,tubes,on='tube_assembly_id',how='inner')
test = pd.merge(test,tubes,on='tube_assembly_id',how='inner')

train['material_id'].fillna('SP-9999',inplace=True)
test['material_id'].fillna('SP-9999',inplace=True)

In [5]:
# drop useless columns and create labels
idx = test.id.values.astype(int)
test = test.drop(['id', 'tube_assembly_id', 'quote_date'], axis = 1)
labels = train.cost.values
train = train.drop(['quote_date', 'cost', 'tube_assembly_id'], axis = 1)

In [6]:
# convert data to numpy array
train = np.array(train)
test = np.array(test)

In [7]:
# label encode the categorical variables (selected here by column position)
for i in range(train.shape[1]):
    if i in [0, 3, 5, 11, 12, 13, 14, 15, 16]:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[:,i]) + list(test[:,i]))
        train[:,i] = lbl.transform(train[:,i])
        test[:,i] = lbl.transform(test[:,i])
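
As an aside, the column positions above are hard-coded against the current column order. A more robust alternative (a sketch only; it would have to run before the np.array conversion in In [6], while train and test are still DataFrames) is to encode every object-typed column:

# Sketch: label-encode by dtype instead of by hard-coded position.
# Assumes train/test are still pandas DataFrames at this point.
for col in train.columns:
    if train[col].dtype == object:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[col].values) + list(test[col].values))
        train[col] = lbl.transform(train[col].values)
        test[col] = lbl.transform(test[col].values)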

In [8]:
print(train[0:5,:])

print(test[0:5,:])


[[41L 0L 0L 1L 1L 1L 6.35 0.71 137.0 8L 19.05 0L 0L 0L 0L 7L 8L 0L 0L 0L]
 [41L 0L 0L 1L 2L 1L 6.35 0.71 137.0 8L 19.05 0L 0L 0L 0L 7L 8L 0L 0L 0L]
 [41L 0L 0L 1L 5L 1L 6.35 0.71 137.0 8L 19.05 0L 0L 0L 0L 7L 8L 0L 0L 0L]
 [41L 0L 0L 1L 10L 1L 6.35 0.71 137.0 8L 19.05 0L 0L 0L 0L 7L 8L 0L 0L 0L]
 [41L 0L 0L 1L 25L 1L 6.35 0.71 137.0 8L 19.05 0L 0L 0L 0L 7L 8L 0L 0L 0L]]
[[41L 0L 0L 1L 1L 9L 12.7 1.65 164.0 5L 38.1 0L 0L 0L 0L 2L 3L 0L 0L 0L]
 [41L 0L 0L 1L 2L 9L 12.7 1.65 164.0 5L 38.1 0L 0L 0L 0L 2L 3L 0L 0L 0L]
 [41L 0L 0L 1L 5L 9L 12.7 1.65 164.0 5L 38.1 0L 0L 0L 0L 2L 3L 0L 0L 0L]
 [41L 0L 0L 1L 10L 9L 12.7 1.65 164.0 5L 38.1 0L 0L 0L 0L 2L 3L 0L 0L 0L]
 [41L 0L 0L 1L 25L 9L 12.7 1.65 164.0 5L 38.1 0L 0L 0L 0L 2L 3L 0L 0L 0L]]

In [9]:
# object array to float
train = train.astype(float)
test = test.astype(float)

# train on log(1 + cost): RMSLE on the original cost equals RMSE on log1p(cost)
label_log = np.log1p(labels)

In [10]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(train, label_log, test_size = 0.2, random_state = 0)

In [11]:
#RMSLE error function
import math

#A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0
                    for i in range(len(y_pred))]
    return (sum(terms_to_sum) * (1.0 / len(y))) ** 0.5

from sklearn.metrics import make_scorer

custom_score = make_scorer(rmsle, greater_is_better=False)
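
As a quick sanity check of the scorer and of the log1p trick (toy values, not from the data): RMSLE on the original scale is just RMSE on the log1p scale, which is why the model is fit on label_log.

# Toy check with made-up values: both lines print the same number (~0.12).
y_chk = [10.0, 20.0, 30.0]
p_chk = [12.0, 18.0, 33.0]
print rmsle(y_chk, p_chk)
print np.sqrt(np.mean((np.log1p(p_chk) - np.log1p(y_chk)) ** 2))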

In [18]:
# try cross validation
parameters = {'n_estimators': [1000] }

rfr = RandomForestRegressor()

model = GridSearchCV(estimator= rfr, param_grid=parameters, scoring= custom_score)
model.fit(X_train, y_train)


Out[18]:
GridSearchCV(cv=None,
       estimator=RandomForestRegressor(bootstrap=True, compute_importances=None,
           criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
           min_samples_split=2, n_estimators=10, n_jobs=1, oob_score=False,
           random_state=None, verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'n_estimators': [1000]}, pre_dispatch='2*n_jobs',
       refit=True, score_func=None,
       scoring=make_scorer(rmsle, greater_is_better=False), verbose=0)

In [19]:
val_preds = np.expm1(model.predict(X_test))
norm_labels = np.expm1(y_test)
score_val = rmsle(norm_labels,val_preds)
print score_val


0.272290819497
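
The grid above only varies n_estimators, so the search is really a single fit with cross-validation. A natural next step (sketch only; the values below are untuned guesses) would be a wider grid over the same setup:

# Sketch of a wider, untuned parameter grid for the same GridSearchCV setup.
wider_parameters = {
    'n_estimators': [200, 500, 1000],
    'max_features': ['auto', 'sqrt', 0.5],
    'min_samples_leaf': [1, 3, 5],
}
wider_model = GridSearchCV(estimator=RandomForestRegressor(),
                           param_grid=wider_parameters,
                           scoring=custom_score)
# wider_model.fit(X_train, y_train)  # much slower than the single-point grid above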

In [14]:
preds = np.expm1(model.predict(test))

In [15]:
preds = pd.DataFrame({"id": idx, "cost": preds})

In [16]:
preds.to_csv('benchmark.csv', index=False)

Looking into the data.

An analysis of the relationships between the different data files:

test_set is the root table.

Sheets with tube assembly ids:

  • bill_of_materials; components for every tube assembly
  • specs; specifications for every tube assembly
  • tube; description of every tube, including spec id, size and bend information, end-of-tube info

Sheets without tube assembly ids but with component ids:

  • components; component type list summary
  • comp_adaptor; detailed component specs, adaptor type
  • comp_boss; detailed component specs, boss type
  • comp_elbow; detailed component specs, elbow type
  • comp_float; detailed component specs, float type
  • comp_hfl; detailed component specs, hfl type
  • comp_nut; detailed component specs, nut type
  • comp_other; detailed component specs, other type
  • comp_sleeve; detailed component specs, sleeve type
  • comp_straight; detailed component specs, straight type
  • comp_tee; detailed component specs, tee type
  • comp_threaded; detailed component specs, threaded type

Sheets without tube assembly ids but with other ids:

  • tube_end_form; end-of-tube types, yes/no on forming
  • type_component; component type id with name
  • type_connection; connection type id (used in some components) with name
  • type_end_form; end form id (used in some components) with name
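
To make the linkage concrete, here is a minimal sketch of joining the tube_assembly_id-keyed sheets onto the quotes and attaching component type info (file and column names assumed from the descriptions above; not run here):

# Sketch: join bill_of_materials and specs on tube_assembly_id,
# then attach the component type list via the first component id.
bom = pd.read_csv(os.path.join(directory, 'bill_of_materials.csv'))
specs = pd.read_csv(os.path.join(directory, 'specs.csv'))
components = pd.read_csv(os.path.join(directory, 'components.csv'))

linked = pd.read_csv(os.path.join(directory, 'train_set.csv'))
linked = pd.merge(linked, bom, on='tube_assembly_id', how='left')
linked = pd.merge(linked, specs, on='tube_assembly_id', how='left')
linked = pd.merge(linked, components, left_on='component_id_1',
                  right_on='component_id', how='left')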

Toying around with some other algorithms


In [20]:
#try support vector machines on existing data to see what happens

from sklearn import svm

model2 = svm.SVR()
model2.fit(X_train,y_train)


Out[20]:
SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [21]:
second_preds = np.expm1(model2.predict(X_test))
norm_labels = np.expm1(y_test)
score_val = rmsle(norm_labels,second_preds)
print score_val


0.576419092814

Wow! Quite a bit worse than the random forest: 0.58 vs. 0.27.
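
One likely contributor is that the SVR saw unscaled features; the random forest is insensitive to feature scale, but the RBF kernel is not. A sketch of retrying with standardized inputs (not run here):

# Sketch: standardize features before the RBF SVR.
scaler = preprocessing.StandardScaler().fit(X_train)
model2_scaled = svm.SVR()
model2_scaled.fit(scaler.transform(X_train), y_train)
scaled_preds = np.expm1(model2_scaled.predict(scaler.transform(X_test)))
print rmsle(np.expm1(y_test), scaled_preds)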

Featurizing Bill of Materials

There are 2048 distinct components, so this will create an enormous number of features, but I think I'll try. The pivot is too big to do in Excel, but it should work in pandas.


In [22]:
materials = pd.read_csv(os.path.join(directory,'bill_of_materials_categorized.csv'))

In [23]:
materials.head()


Out[23]:
  tube_assembly_id component_id_1  quantity_1
0         TA-00001         C-1622           2
1         TA-00002         C-1312           2
2         TA-00003         C-1312           2
3         TA-00004         C-1312           2
4         TA-00005         C-1624           1

In [65]:
materialsTable = pd.pivot_table(materials,values='quantity_1',index='tube_assembly_id',columns='component_id_1',aggfunc=np.sum, fill_value=0)
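
The file used above only carries component_id_1 / quantity_1; the raw bill_of_materials.csv has eight such pairs, so a fuller pivot would stack them first (a sketch, assuming the standard column names component_id_1..8 and quantity_1..8):

# Sketch: stack all eight component/quantity pairs, then pivot once.
bom = pd.read_csv(os.path.join(directory, 'bill_of_materials.csv'))
frames = []
for i in range(1, 9):
    part = bom[['tube_assembly_id', 'component_id_%d' % i, 'quantity_%d' % i]].copy()
    part.columns = ['tube_assembly_id', 'component_id', 'quantity']
    frames.append(part.dropna(subset=['component_id']))
long_bom = pd.concat(frames)
fullTable = pd.pivot_table(long_bom, values='quantity', index='tube_assembly_id',
                           columns='component_id', aggfunc=np.sum, fill_value=0)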

In [83]:
materialsTable.index


Out[83]:
Index([u'TA-00001', u'TA-00002', u'TA-00003', u'TA-00004', u'TA-00005', u'TA-00006', u'TA-00007', u'TA-00008', u'TA-00009', u'TA-00010', u'TA-00011', u'TA-00012', u'TA-00013', u'TA-00014', u'TA-00015', u'TA-00016', u'TA-00017', u'TA-00018', u'TA-00020', u'TA-00021', u'TA-00022', u'TA-00023', u'TA-00024', u'TA-00025', u'TA-00026', u'TA-00027', u'TA-00028', u'TA-00029', u'TA-00030', u'TA-00031', u'TA-00032', u'TA-00033', u'TA-00034', u'TA-00035', u'TA-00036', u'TA-00037', u'TA-00038', u'TA-00039', u'TA-00040', u'TA-00041', u'TA-00042', u'TA-00043', u'TA-00045', u'TA-00046', u'TA-00047', u'TA-00048', u'TA-00049', u'TA-00050', u'TA-00051', u'TA-00052', u'TA-00053', u'TA-00054', u'TA-00055', u'TA-00056', u'TA-00057', u'TA-00058', u'TA-00059', u'TA-00060', u'TA-00061', u'TA-00062', u'TA-00063', u'TA-00065', u'TA-00066', u'TA-00067', u'TA-00068', u'TA-00069', u'TA-00071', u'TA-00072', u'TA-00073', u'TA-00075', u'TA-00076', u'TA-00077', u'TA-00078', u'TA-00079', u'TA-00080', u'TA-00081', u'TA-00082', u'TA-00083', u'TA-00084', u'TA-00085', u'TA-00086', u'TA-00087', u'TA-00088', u'TA-00089', u'TA-00090', u'TA-00091', u'TA-00092', u'TA-00093', u'TA-00094', u'TA-00095', u'TA-00096', u'TA-00097', u'TA-00098', u'TA-00099', u'TA-00100', u'TA-00102', u'TA-00103', u'TA-00104', u'TA-00105', u'TA-00107', ...], dtype='object')

Need to be smart about this: rebuild the training data from scratch and relink it to the pivot table above. "tube_assembly_id" was dropped earlier, so it is not yet clear what the best way to do this is.
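
One possible route (a sketch, not verified): rebuild the training frame with tube_assembly_id retained, left-join materialsTable on it, and only drop the id right before modelling.

# Sketch: rebuild train keeping tube_assembly_id, attach the pivoted
# bill-of-materials counts, then drop the id before modelling.
train2 = pd.read_csv(os.path.join(directory, 'train_set.csv'), parse_dates=[2,])
train2 = pd.merge(train2, tubes, on='tube_assembly_id', how='inner')
train2 = pd.merge(train2, materialsTable.reset_index(),
                  on='tube_assembly_id', how='left')
pivot_cols = materialsTable.columns
train2[pivot_cols] = train2[pivot_cols].fillna(0)  # missing assemblies get zero counts
labels2 = train2.cost.values
train2 = train2.drop(['quote_date', 'cost', 'tube_assembly_id'], axis=1)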