Featurizing bill of materials part 2: bring in component specs as multiple columns (and lose ID)



In [1]:

    
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing, ensemble, cross_validation, grid_search
from sklearn.ensemble import RandomForestRegressor
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation
import xgboost as xgb



In [2]:

    
print os.listdir('.')
directory = os.path.join(os.getcwd(),'competition_data')
filename = 'train_set.csv'
path = os.path.join(directory,filename)
print path









    



['.ipynb_checkpoints', 'benchmark.csv', 'competition_data', 'Feat2OheXgb.ipynb', 'Feat2wXGB.ipynb', 'Feat3wXGB.ipynb', 'MoreFeatures.ipynb', 'MoreFeatures2.ipynb', 'Prelim.ipynb']
C:\Users\Victor\Desktop\Kaggle\Caterpillar\competition_data\train_set.csv



In [3]:

    
train = pd.read_csv(os.path.join(directory,'train_set.csv'), parse_dates=[2,])
test = pd.read_csv(os.path.join(directory,'test_set.csv'), parse_dates=[3,])



In [4]:

    
tubes = pd.read_csv(os.path.join(directory,'tube.csv'))

train = pd.merge(train,tubes,on='tube_assembly_id',how='inner')
test = pd.merge(test,tubes,on='tube_assembly_id',how='inner')

train['material_id'].fillna('SP-9999',inplace=True)
test['material_id'].fillna('SP-9999',inplace=True)



In [5]:

    
print train.shape









    



(30213, 23)

Featurizing Bill of Materials



In [6]:

    
materials = pd.read_csv(os.path.join(directory,'bill_of_materials_with_labels.csv'))



In [7]:

    
materials.columns









    Out[7]:





Index([u'tube_assembly_id', u'component_id', u'label'], dtype='object')



In [8]:

    
comp_filenames = ['comp_adaptor.csv',
              'comp_boss.csv',
              'comp_elbow.csv',
              'comp_float.csv',
              'comp_hfl.csv',
              'comp_nut.csv',
              'comp_other.csv',
              'comp_sleeve.csv',
              'comp_straight.csv',
              'comp_tee.csv',
              'comp_threaded.csv']

comp_files = [pd.read_csv(os.path.join(directory,filename)) for filename in comp_filenames]



In [9]:

    
print comp_files[1].shape
testing1 = np.array(comp_files[1])
print testing1.shape









    



(147, 15)
(147L, 15L)



In [11]:

    
#try onehotencoding at spreadsheet input level (don't have dimensional interaction problem)
for ind in range(len(comp_files)):
    testFile = comp_files[ind]
    testFile.fillna(0,inplace=True)
    testFile = np.array(testFile)
    rangeset = testFile.shape[1]
    for i in range(3,rangeset):
        #if type(sorted(train[:,i],reverse=True)[1]) is str:
        if any(type(x) is str for x in testFile[:,i]):
            testFile[:,i][testFile[:,i]==0] = 'zzz'
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(testFile[:,i]))
            testFile[:,i] = lbl.transform(testFile[:,i])
            encoder = preprocessing.OneHotEncoder()
            encoder.fit([testFile[:,i]])
            testFile[:,i] = encoder.transform([testFile[:,i]]).toarray()
    testFile = pd.DataFrame(testFile)
    comp_files[ind]=testFile.rename(columns = {0:'component_id'})



In [12]:

    
print comp_files[1].head()









    



  component_id       1     2  3  4  5     6  7  8  9  10  11 12 13     14
0       C-0008  CP-018  Boss  1  1  1    17  0  0  1  22   0  1  1  0.032
1       C-0009  CP-018  Boss  1  1  1    13  0  0  1  25   0  1  1  0.033
2       C-0020  CP-018  Boss  1  1  1  28.4  0  0  1  35   0  1  1   0.07
3       C-0054  CP-018  Boss  1  1  1  27.1  0  0  1   0   0  1  1   0.18
4       C-0071  CP-018  Boss  1  1  1    20  0  0  1  30  23  1  1   0.08



In [13]:

    
for i in range(1,7):
    #filter by instances
    filtered_materials = materials[materials['label']==i].drop('label',1)
    for comp_file in comp_files:
        #join by component type files
        mat_comp = pd.merge(filtered_materials,comp_file,on='component_id', how='inner')
        #join onto train and test sets
        if mat_comp.shape[0]!=0:
            train = pd.merge(train,mat_comp,on='tube_assembly_id', how='left')
            test = pd.merge(test,mat_comp,on='tube_assembly_id', how='left')



In [14]:

    
print train.shape
print test.shape









    



(30213, 440)
(30235, 440)



In [15]:

    
train.fillna(0,inplace=True)
test.fillna(0,inplace=True)

End Bill of Materials stuff



In [16]:

    
# drop useless columns and create labels
idx = test.id.values.astype(int)
test = test.drop(['id', 'tube_assembly_id', 'quote_date'], axis = 1)
labels = train.cost.values
train = train.drop(['quote_date', 'cost', 'tube_assembly_id'], axis = 1)



In [17]:

    
# convert data to numpy array
train = np.array(train)
test = np.array(test)



In [18]:

    
[list(train[:,1]),list(test[:,1])]
print len(train[:,1])
print len(test[:,1])



In [19]:

    
rangeset = train.shape[1]
for i in range(rangeset):
    #if type(sorted(train[:,i],reverse=True)[1]) is str:
    if any(type(x) is str for x in train[:,i]) or any(type(x) is str for x in test[:,i]):
        train[:,i][train[:,i]==0] = 'zzz'
        test[:,i][test[:,i]==0] = 'zzz'
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[:,i]) + list(test[:,i]))
        train[:,i] = lbl.transform(train[:,i])
        test[:,i] = lbl.transform(test[:,i])
        """encoder = preprocessing.OneHotEncoder()
        combined = [list(train[:,1]),list(test[:,1])]
        encoder.fit(combined)
        train[:,i] = encoder.transform([train[:,i]])
        test[:,i] = encoder.transform([test[:,i]])"""



In [20]:

    
print(train[0:5,:])

print(test[0:5,:])









    



[[41L 0L 0L ..., 4L 3L 0.0]
 [41L 0L 0L ..., 4L 3L 0.0]
 [41L 0L 0L ..., 4L 3L 0.0]
 [41L 0L 0L ..., 4L 3L 0.0]
 [41L 0L 0L ..., 4L 3L 0.0]]
[[41L 0L 0L ..., 4L 3L 0.0]
 [41L 0L 0L ..., 4L 3L 0.0]
 [41L 0L 0L ..., 4L 3L 0.0]
 [41L 0L 0L ..., 4L 3L 0.0]
 [41L 0L 0L ..., 4L 3L 0.0]]



In [21]:

    
# object array to float
train = train.astype(float)
test = test.astype(float)

# i like to train on log(1+x) for RMSLE ;) 
# The choice is yours :)
label_log = np.log1p(labels)



In [22]:

    
X_train, X_test, y_train, y_test = cross_validation.train_test_split(train, label_log, test_size = 0.2, random_state = 0)



In [23]:

    
#RMSLE error function
import math

#A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
	assert len(y) == len(y_pred)
	terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
	return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

from sklearn.metrics import make_scorer

custom_score = make_scorer(rmsle,False)



In [24]:

    
"""# sci-kit random forests

parameters = {'n_estimators': [1000] }

model = RandomForestRegressor(n_estimators=100)

#model = GridSearchCV(estimator= rfr, param_grid=parameters, )
#model.fit(X_train, y_train)
model.fit(train,label_log)"""









    Out[24]:





"# sci-kit random forests\n\nparameters = {'n_estimators': [1000] }\n\nmodel = RandomForestRegressor(n_estimators=100)\n\n#model = GridSearchCV(estimator= rfr, param_grid=parameters, )\n#model.fit(X_train, y_train)\nmodel.fit(train,label_log)"



In [25]:

    
# xgboost random forest model

params = {}
params["objective"] = "reg:linear"
params["eta"] = 0.1
params["min_child_weight"] = 5
params["subsample"] = 1.0
params["scale_pos_weight"] = 1.0
params["silent"] = 1
params["max_depth"] = 7

plst = list(params.items())

xgtrain = xgb.DMatrix(X_train, label = y_train)
xgtest = xgb.DMatrix(X_test)
#xgtrain = xgb.DMatrix(train, label=label_log)
#xgtest = xgb.DMatrix(test)

num_rounds = 500
model = xgb.train(plst, xgtrain, num_rounds)



In [26]:

    
#val_preds = np.expm1(model.predict(X_test))
val_preds = np.expm1(model.predict(xgtest))
norm_labels = np.expm1(y_test)
score_val = rmsle(norm_labels,val_preds)
print 'Baseline score is 0.207 with label features'
print score_val









    



Baseline score is 0.207 with label features
0.209879227078



In [39]:

    
#preds = np.expm1(model.predict(test))
preds = np.expm1(model.predict(xgtest))



In [40]:

    
preds = pd.DataFrame({"id": idx, "cost": preds})



In [41]:

    
preds.to_csv('benchmark.csv', index=False)

Looking into the data.

An analysis into the relationships between the different data files:

test_set is root

sheets with tube assembly ids:
bill_of_materials; components for every tube assembly
specs; specifications for every tube assembly
tube; description for every tube, including spec id, size and bend information, end of tube info

sheets without tube assembly ids with component ids:

components; component type list summary

comp_adaptor; detailed component specs, adaptor type
comp_boss; detailed component specs, boss type
comp_elbow; detailed component specs, elbow type
comp_float; detailed component specs, float type
comp_hfl; detailed component specs, hfl type
comp_nut; detailed component specs, nut type
comp_other; detailed component specs, other type
comp_sleeve; detailed component specs, sleeve type
comp_straight; detailed component specs, straight type
comp_tee; detailed component specs, tee type
comp_threaded; detailed component specs, threaded type

sheets without tube assembly ids with other ids
tube_end_form; end of tube types, yes/no on forming
type_component; component type id with name
type_connection; connection type id (in some components) with name
type_end_form; end form id (in some components) with name