In [7]:
%pylab inline
import pandas as pd
Read all files related to components.
In [8]:
components = pd.read_csv('./competition_data/components.csv')
comp_adaptor = pd.read_csv('./competition_data/comp_adaptor.csv')
comp_boss = pd.read_csv('./competition_data/comp_boss.csv')
comp_elbow = pd.read_csv('./competition_data/comp_elbow.csv')
comp_float = pd.read_csv('./competition_data/comp_float.csv')
comp_hfl = pd.read_csv('./competition_data/comp_hfl.csv')
comp_nut = pd.read_csv('./competition_data/comp_nut.csv')
comp_other = pd.read_csv('./competition_data/comp_other.csv')
comp_sleeve = pd.read_csv('./competition_data/comp_sleeve.csv')
comp_straight = pd.read_csv('./competition_data/comp_straight.csv')
comp_tee = pd.read_csv('./competition_data/comp_tee.csv')
comp_threaded = pd.read_csv('./competition_data/comp_threaded.csv')
comp_id_dict = components.set_index('component_id')['component_type_id'].to_dict()
In [9]:
print 'components:'
print sorted(components['component_type_id'].unique())
print 'adaptor:'
print comp_adaptor['component_type_id'].unique()
print 'boss:'
print comp_boss['component_type_id'].unique()
print 'elbow:'
print comp_elbow['component_type_id'].unique()
print 'float:'
print comp_float['component_type_id'].unique()
print 'hfl:'
print comp_hfl['component_type_id'].unique()
print 'nut:'
print comp_nut['component_type_id'].unique()
# comp_other does not contain component_type_id column
print 'sleeve:'
print comp_sleeve['component_type_id'].unique()
print 'straight:'
print comp_straight['component_type_id'].unique()
print 'tee:'
print comp_tee['component_type_id'].unique()
print 'threaded:'
print comp_threaded['component_type_id'].unique()
print comp_adaptor[comp_adaptor['component_id']=='9999']
In [10]:
comp_adaptor
Out[10]:
In [11]:
comp_adaptor[comp_adaptor.component_type_id == 'CP-028'].weight.hist()
Out[11]:
In [12]:
bill_of_materials = pd.read_csv('./competition_data/bill_of_materials.csv')
bill_of_materials.fillna(0,inplace=True)
In [13]:
print bill_of_materials.columns
print bill_of_materials.ix[1,]
In [14]:
comp_list = [comp_adaptor,comp_boss,comp_elbow,comp_float,comp_hfl,comp_nut,comp_other,\
comp_sleeve,comp_straight,comp_tee,comp_threaded]
comp_weight = pd.concat([pd.DataFrame(c,columns=['component_id','weight']) for c in comp_list])
comp_id_weight = comp_weight.set_index('component_id')['weight'].to_dict()
print comp_other[components['component_id']=='9999']
In [15]:
tube_assembly_comp = pd.DataFrame()
tube_assembly_comp['tube_assembly_id'] = bill_of_materials['tube_assembly_id']
tube_assembly_comp['comp_quant'] = bill_of_materials['quantity_1']+bill_of_materials['quantity_2']+\
bill_of_materials['quantity_3']+bill_of_materials['quantity_4']+bill_of_materials['quantity_5']+\
bill_of_materials['quantity_6']+bill_of_materials['quantity_7']+bill_of_materials['quantity_8']
comp_id_weight[0] = 0
# cannot find component_id == '9999' in comp table
# but '9999' can be found in component table
comp_id_weight['9999'] = 0
def f(s):
weight = []
for i in range(1,9):
weight.append(float(comp_id_weight[s['component_id_'+str(i)]]) * s['quantity_'+str(i)])
return sum(weight)
tube_assembly_comp['weight'] = bill_of_materials.apply(f,axis=1)
In [22]:
# tube_assembly_comp.plot()
# plt.show()
tube_assembly_comp.shape
Out[22]:
In [25]:
train_df = pd.read_csv('./competition_data/train_set.csv', parse_dates=['quote_date'])
# train_df.shape
train_df_weight_quant = train_df.set_index('tube_assembly_id').join(tube_assembly_comp.set_index('tube_assembly_id'))
train_df_weight_quant.sample(n=25)
Out[25]:
In [34]:
x_weight = train_df_weight_quant[train_df_weight_quant['quantity']==1]['weight']
y = train_df_weight_quant[train_df_weight_quant['quantity']==1]['cost']
plt.subplot(1,2,1)
plt.scatter(x_weight,y)
plt.title('cost vs. weight')
x_quant = train_df_weight_quant[train_df_weight_quant['quantity']==1]['comp_quant']
plt.subplot(1,2,2)
plt.scatter(x_quant,y)
plt.title('cost vs. comp. quantity')
plt.subplots_adjust()
plt.show()
In [19]:
import statsmodels.formula.api as sm
result = sm.ols(formula="cost ~ weight + quantity", data=train_df_weight_quant).fit()
print result.summary()
In [35]:
result.resid.plot()
Out[35]: