In [39]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing, ensemble, cross_validation, grid_search
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation, svm
from sklearn.linear_model import LinearRegression, BayesianRidge, SGDRegressor
import xgboost as xgb
In [2]:
# Show what is in the working directory, then build the path of the
# competition data folder and of the training CSV inside it.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(os.listdir('.'))
directory = os.path.join(os.getcwd(), 'competition_data')
filename = 'train_set.csv'
path = os.path.join(directory, filename)
print(path)
In [3]:
# Read the train / test quote tables from the data folder. The date column
# is parsed by position: index 2 in train_set.csv, index 3 in test_set.csv.
train = pd.read_csv(
    os.path.join(directory, 'train_set.csv'),
    parse_dates=[2],
)
test = pd.read_csv(
    os.path.join(directory, 'test_set.csv'),
    parse_dates=[3],
)
In [4]:
# Join the per-tube description table onto both data sets.
# NOTE(review): how='inner' drops any row whose tube_assembly_id is absent
# from tube.csv -- confirm no test rows are lost, since every test id needs
# a prediction.
tubes = pd.read_csv(os.path.join(directory, 'tube.csv'))
train = pd.merge(train, tubes, on='tube_assembly_id', how='inner')
test = pd.merge(test, tubes, on='tube_assembly_id', how='inner')

# Replace missing material ids with the placeholder 'SP-9999'.
# The result is assigned back instead of using inplace=True on the selected
# column, because a chained in-place update is not guaranteed to reach the
# parent frame.
train['material_id'] = train['material_id'].fillna('SP-9999')
test['material_id'] = test['material_id'].fillna('SP-9999')
In [5]:
# Bill of materials with an extra 'label' column (filtered on by a later cell).
materials = pd.read_csv(
    os.path.join(directory, 'bill_of_materials_with_labels.csv')
)
In [6]:
# Display the column index of the bill-of-materials frame.
materials.columns
Out[6]:
In [7]:
# One detail table per component type; every file is read from the data folder.
comp_filenames = [
    'comp_adaptor.csv',
    'comp_boss.csv',
    'comp_elbow.csv',
    'comp_float.csv',
    'comp_hfl.csv',
    'comp_nut.csv',
    'comp_other.csv',
    'comp_sleeve.csv',
    'comp_straight.csv',
    'comp_tee.csv',
    'comp_threaded.csv',
]

# Load the tables in the same order as the file names above.
comp_files = []
for filename in comp_filenames:
    comp_files.append(pd.read_csv(os.path.join(directory, filename)))
In [8]:
# Disabled scratch check kept as a bare string literal: nothing inside it is
# executed, the cell only evaluates to the text itself.
# NOTE(review): the pd.read_csv( call inside the string has no closing
# parenthesis, so the snippet would not run if it were un-quoted as is.
"""filename = 'comp_adaptor.csv'
filtered_materials = materials[materials['label']==6].drop('label',1)
comp_file = pd.read_csv(os.path.join(directory,filename)
pd.merge(filtered_materials,comp_file,on='component_id', how='inner').shape[0]"""
Out[8]:
In [9]:
# Attach component details to train and test, one bill-of-materials label
# (1..6) and one component-type table at a time.
# This cell rebinds `train` / `test`, so running it twice merges the same
# columns in again.
# NOTE(review): overlapping column names across the repeated merges fall back
# to pandas' default suffixes, and a left merge multiplies rows if mat_comp
# holds more than one row per tube_assembly_id -- confirm both are intended.
for i in range(1, 7):
    # Bill-of-materials rows carrying this label; the label column itself is
    # dropped. `axis` is passed by keyword because the positional form is
    # deprecated in pandas.
    filtered_materials = materials[materials['label'] == i].drop('label', axis=1)
    for comp_file in comp_files:
        # Keep only the components described in this component-type table.
        mat_comp = pd.merge(filtered_materials, comp_file, on='component_id', how='inner')
        # Skip label / component-type pairs that have no rows at all.
        if mat_comp.shape[0] != 0:
            train = pd.merge(train, mat_comp, on='tube_assembly_id', how='left')
            test = pd.merge(test, mat_comp, on='tube_assembly_id', how='left')
In [10]:
# Shape of the training frame after the component merges
# (print(...) form runs on both Python 2 and 3).
print(train.shape)
In [11]:
# Replace every remaining missing value with 0 in both frames.
train = train.fillna(0)
test = test.fillna(0)
In [12]:
# Set aside the test ids (for the submission) and the training target, then
# remove the columns that are not passed to the models.
idx = test['id'].values.astype(int)
labels = train['cost'].values

test = test.drop(['id', 'tube_assembly_id', 'quote_date'], axis=1)
train = train.drop(['quote_date', 'cost', 'tube_assembly_id'], axis=1)
In [13]:
# From here on both data sets are plain numpy arrays instead of DataFrames.
train, test = np.array(train), np.array(test)
In [14]:
# Label-encode every column that contains at least one `str` value in either
# array. The encoder is fitted on train and test together so both share one
# mapping.
rangeset = train.shape[1]
for col in range(rangeset):
    # Basic slices are views, so writes through them modify train / test.
    train_col = train[:, col]
    test_col = test[:, col]

    has_text = (any(type(value) is str for value in train_col)
                or any(type(value) is str for value in test_col))
    if not has_text:
        continue

    # Entries equal to 0 in a text column become the placeholder 'zzz'.
    train_col[train_col == 0] = 'zzz'
    test_col[test_col == 0] = 'zzz'

    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_col) + list(test_col))
    train[:, col] = lbl.transform(train_col)
    test[:, col] = lbl.transform(test_col)
In [15]:
# After encoding, both arrays are cast to float.
train, test = train.astype(float), test.astype(float)

# The models are fitted on log(1 + cost); predictions are mapped back with
# expm1 further down, which matches the RMSLE metric used for scoring.
label_log = np.log1p(labels)
In [16]:
# Hold out 20% of the training rows for validation; the seed is fixed so the
# split is the same on every run.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    train,
    label_log,
    test_size=0.2,
    random_state=0,
)
In [46]:
#RMSLE error function
import math
#A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
    """Return the Root Mean Squared Logarithmic Error of ``y_pred`` against ``y``.

    Computes ``sqrt(mean((log(1 + y_pred) - log(1 + y)) ** 2))``.

    Values are paired by position: both inputs are converted to numpy arrays
    first, so a pandas Series with an arbitrary index is handled the same way
    as a list or an ndarray.

    Parameters
    ----------
    y : sequence of numbers
        Actual values.
    y_pred : sequence of numbers
        Predicted values, same length as ``y``.

    Returns
    -------
    float
        The RMSLE score (0.0 for a perfect prediction).

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    ValueError
        If the inputs are empty, or if any value is <= -1 (the logarithm of
        ``1 + value`` is undefined there).
    """
    assert len(y) == len(y_pred), "y and y_pred must have the same length"

    actual = np.asarray(y, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    if actual.size == 0:
        raise ValueError("rmsle needs at least one value")
    # Fail loudly instead of letting log1p return NaN for out-of-domain input.
    if (actual <= -1).any() or (predicted <= -1).any():
        raise ValueError("rmsle is undefined for values <= -1")

    log_diff = np.log1p(predicted) - np.log1p(actual)
    return float(np.sqrt(np.mean(log_diff ** 2)))
from sklearn.metrics import make_scorer
# Scorer wrapper around rmsle for scikit-learn model selection. RMSLE is an
# error, so lower is better; the flag is passed by keyword so the call does
# not depend on make_scorer's positional argument order.
custom_score = make_scorer(rmsle, greater_is_better=False)
In [47]:
# Run-mode switch compared against 'Yes' / 'No' by the cells below:
# 'Yes' fits the models on the X_train split and prints hold-out scores,
# 'No' fits on the full training data and writes the submission CSV.
testRun = 'No'
In [48]:
# scikit-learn random forest regressor.
# `parameters` is not referenced by any later cell in this notebook; it is
# kept in case a grid search is added back.
parameters = {'n_estimators': [1000]}
# random_state is pinned so repeated runs build the same forest.
model_rfr = RandomForestRegressor(n_estimators=100, random_state=0)
if testRun == 'Yes':
    # Validation mode: fit on the training split only.
    model_rfr.fit(X_train, y_train)
else:
    # Submission mode: fit on every training row.
    model_rfr.fit(train, label_log)
In [49]:
# Hold-out score of the random forest (validation mode only).
if testRun == 'Yes':
    # The model was fitted on log1p(cost); expm1 maps predictions and targets
    # back before scoring.
    val_preds_rfr = np.expm1(model_rfr.predict(X_test))
    norm_labels = np.expm1(y_test)
    score_val_rfr = rmsle(norm_labels, val_preds_rfr)
    # Hard-coded reference figure (presumably from an earlier run), printed
    # next to the fresh score for comparison.
    print('Base score is 0.249')
    print(score_val_rfr)
In [50]:
# xgboost model: a booster trained with xgb.train for num_rounds rounds
# using the parameters below.
params = {
    "objective": "reg:linear",
    "eta": 0.1,
    "min_child_weight": 5,
    "subsample": 1.0,
    "scale_pos_weight": 1.0,
    "silent": 1,
    "max_depth": 7,
}
plst = list(params.items())

if testRun != 'Yes':
    # Submission mode: fit on all training rows, predict the real test set.
    xgtrain = xgb.DMatrix(train, label=label_log)
    xgtest = xgb.DMatrix(test)
else:
    # Validation mode: fit on the training split, predict the hold-out split.
    xgtrain = xgb.DMatrix(X_train, label=y_train)
    xgtest = xgb.DMatrix(X_test)

num_rounds = 500
model_xgb = xgb.train(plst, xgtrain, num_rounds)
In [51]:
# Hold-out score of the xgboost model (validation mode only).
if testRun == 'Yes':
    # The model was fitted on log1p(cost); expm1 maps predictions and targets
    # back before scoring.
    val_preds_xgb = np.expm1(model_xgb.predict(xgtest))
    norm_labels = np.expm1(y_test)
    score_val_xgb = rmsle(norm_labels, val_preds_xgb)
    # Hard-coded reference figure (presumably from an earlier run), printed
    # next to the fresh score for comparison.
    print('Base score is 0.207')
    print(score_val_xgb)
In [52]:
# scikit-learn gradient boosting regressor with default hyper-parameters.
# random_state is pinned so repeated runs build the same model.
model_gbr = GradientBoostingRegressor(random_state=0)
if testRun == 'Yes':
    # Validation mode: fit on the training split only.
    model_gbr.fit(X_train, y_train)
else:
    # Submission mode: fit on every training row.
    model_gbr.fit(train, label_log)
In [53]:
# Hold-out score of the gradient boosting model (validation mode only).
if testRun == 'Yes':
    # The model was fitted on log1p(cost); expm1 maps predictions and targets
    # back before scoring.
    val_preds_gbr = np.expm1(model_gbr.predict(X_test))
    norm_labels = np.expm1(y_test)
    score_val_gbr = rmsle(norm_labels, val_preds_gbr)
    # Hard-coded reference figure (presumably from an earlier run), printed
    # next to the fresh score for comparison.
    print('Base score is 0.327')
    print(score_val_gbr)
In [54]:
# Build the inputs of the stacking model: one row per sample holding the
# (log-space) predictions of the random forest, xgboost and gradient boosting
# models, in that order.
# NOTE(review): the "train" rows are predictions on the same data the base
# models were fitted on, so the stacker sees in-sample predictions -- confirm
# this is intended (out-of-fold predictions are the usual alternative).
if testRun != 'Yes':
    X_stack_train = [list(row) for row in zip(model_rfr.predict(train),
                                              model_xgb.predict(xgtrain),
                                              model_gbr.predict(train))]
    X_stack_test = [list(row) for row in zip(model_rfr.predict(test),
                                             model_xgb.predict(xgtest),
                                             model_gbr.predict(test))]
else:
    X_stack_train = [list(row) for row in zip(model_rfr.predict(X_train),
                                              model_xgb.predict(xgtrain),
                                              model_gbr.predict(X_train))]
    X_stack_test = [list(row) for row in zip(model_rfr.predict(X_test),
                                             model_xgb.predict(xgtest),
                                             model_gbr.predict(X_test))]
In [55]:
# Linear blend of the three base-model predictions, fitted against the same
# log-space target the base models used in the current run mode.
model_stack = LinearRegression()
model_stack.fit(X_stack_train, y_train if testRun == 'Yes' else label_log)
In [56]:
# Hold-out score of the stacked model (validation mode only).
if testRun == 'Yes':
    # The stacker predicts log1p(cost); expm1 maps predictions and targets
    # back before scoring.
    val_preds_stack = np.expm1(model_stack.predict(X_stack_test))
    norm_labels = np.expm1(y_test)
    score_val_stack = rmsle(norm_labels, val_preds_stack)
    # Hard-coded reference figure (presumably from an earlier run), printed
    # next to the fresh score for comparison.
    print('Base score is 0.237')
    print(score_val_stack)
In [57]:
# Submission mode: predict the test set with the stacked model and write the
# result to benchmark.csv.
if testRun == 'No':
    # The stacker predicts log1p(cost); expm1 converts back to cost.
    preds = np.expm1(model_stack.predict(X_stack_test))
    # Column order is stated explicitly; a dict-built frame otherwise orders
    # its columns differently depending on the pandas version.
    preds = pd.DataFrame({"id": idx, "cost": preds}, columns=["id", "cost"])
    preds.to_csv('benchmark.csv', index=False)
An analysis of the relationships between the different data files:
test_set (like train_set) is the root table; the other sheets are joined onto it
sheets with tube assembly ids:
bill_of_materials; components for every tube assembly
specs; specifications for every tube assembly
tube; description for every tube, including spec id, size and bend information, end of tube info
sheets without tube assembly ids, keyed by component id:
comp_adaptor; detailed component specs, adaptor type
comp_boss; detailed component specs, boss type
comp_elbow; detailed component specs, elbow type
comp_float; detailed component specs, float type
comp_hfl; detailed component specs, hfl type
comp_nut; detailed component specs, nut type
comp_other; detailed component specs, other type
comp_sleeve; detailed component specs, sleeve type
comp_straight; detailed component specs, straight type
comp_tee; detailed component specs, tee type
comp_threaded; detailed component specs, threaded type
sheets without tube assembly ids, keyed by other ids:
tube_end_form; end of tube types, yes/no on forming
type_component; component type id with name
type_connection; connection type id (in some components) with name
type_end_form; end form id (in some components) with name