In [47]:
%pylab inline
import pandas as pd
from sklearn import linear_model


Populating the interactive namespace from numpy and matplotlib

In [9]:
train_df = pd.read_csv('./competition_data/train_set.csv', parse_dates=['quote_date'])
print train_df.shape
print train_df.dtypes
print train_df.sample(n=10, random_state=0)


(30213, 8)
tube_assembly_id              object
supplier                      object
quote_date            datetime64[ns]
annual_usage                   int64
min_order_quantity             int64
bracket_pricing               object
quantity                       int64
cost                         float64
dtype: object
      tube_assembly_id supplier quote_date  annual_usage  min_order_quantity  \
9730          TA-05679   S-0066 2013-08-04             0                   0   
7442          TA-04183   S-0066 2013-10-01             0                   0   
19179         TA-13388   S-0054 2009-05-26             1                   0   
9565          TA-05576   S-0072 2002-08-30            88                   1   
718           TA-00338   S-0066 2013-07-01             0                   0   
14127         TA-08815   S-0058 2008-12-01           129                  35   
252           TA-00108   S-0066 2013-10-01             0                   0   
990           TA-00497   S-0066 2013-09-01             0                   0   
19822         TA-14032   S-0041 2013-08-05             1                  20   
26509         TA-19805   S-0066 2013-08-01             0                   0   

      bracket_pricing  quantity       cost  
9730              Yes         1  28.616259  
7442              Yes       100   6.279107  
19179             Yes         1  20.078120  
9565               No         1   4.434602  
718               Yes        10   4.609872  
14127              No        35   6.167825  
252               Yes       100  11.178313  
990               Yes         5   9.178013  
19822              No        20   6.732583  
26509             Yes         1  21.513663  

In [6]:
train_df[(train_df.bracket_pricing == 'No') & \
         (train_df.quantity > 1) & \
         (train_df.min_order_quantity != train_df.quantity)]


Out[6]:
tube_assembly_id supplier quote_date annual_usage min_order_quantity bracket_pricing quantity cost
14289 TA-08933 S-0066 2011-08-01 52 14 No 10 52.497454
15382 TA-09767 S-0066 2011-08-01 265 32 No 3 24.245645
15819 TA-10173 S-0066 2011-08-01 75 30 No 12 5.358246
20444 TA-14751 S-0066 2011-08-01 245 5 No 4 17.796832

In [10]:
train_df_no_bracket = train_df[(train_df.bracket_pricing == 'No')]

In [32]:
print train_df_no_bracket.tube_assembly_id.unique().size
print train_df_no_bracket.shape


3930
(3930, 8)

In [18]:
bill_materials_df = pd.read_csv('./competition_data/bill_of_materials.csv')
print bill_materials_df.shape
print bill_materials_df.dtypes
print bill_materials_df.sample(10, random_state=0)


(21198, 17)
tube_assembly_id     object
component_id_1       object
quantity_1          float64
component_id_2       object
quantity_2          float64
component_id_3       object
quantity_3          float64
component_id_4       object
quantity_4          float64
component_id_5       object
quantity_5          float64
component_id_6       object
quantity_6          float64
component_id_7       object
quantity_7          float64
component_id_8       object
quantity_8          float64
dtype: object
      tube_assembly_id component_id_1  quantity_1 component_id_2  quantity_2  \
9258          TA-09259         C-1621           2         C-1628           2   
6255          TA-06256         C-0442           1            NaN         NaN   
5653          TA-05654         C-1624           1         C-1625           1   
449           TA-00450            NaN         NaN            NaN         NaN   
14694         TA-14695            NaN         NaN            NaN         NaN   
19962         TA-19964         C-1312           2            NaN         NaN   
12335         TA-12336         C-1638           2            NaN         NaN   
15664         TA-15665         C-1621           2         C-1628           2   
14435         TA-14436         C-1624           1         C-1631           1   
15108         TA-15109         C-1312           2            NaN         NaN   

      component_id_3  quantity_3 component_id_4  quantity_4 component_id_5  \
9258             NaN         NaN            NaN         NaN            NaN   
6255             NaN         NaN            NaN         NaN            NaN   
5653          C-1631           1         C-1652           1            NaN   
449              NaN         NaN            NaN         NaN            NaN   
14694            NaN         NaN            NaN         NaN            NaN   
19962            NaN         NaN            NaN         NaN            NaN   
12335            NaN         NaN            NaN         NaN            NaN   
15664            NaN         NaN            NaN         NaN            NaN   
14435         C-0264           1            NaN         NaN            NaN   
15108            NaN         NaN            NaN         NaN            NaN   

       quantity_5 component_id_6  quantity_6 component_id_7  quantity_7  \
9258          NaN            NaN         NaN            NaN         NaN   
6255          NaN            NaN         NaN            NaN         NaN   
5653          NaN            NaN         NaN            NaN         NaN   
449           NaN            NaN         NaN            NaN         NaN   
14694         NaN            NaN         NaN            NaN         NaN   
19962         NaN            NaN         NaN            NaN         NaN   
12335         NaN            NaN         NaN            NaN         NaN   
15664         NaN            NaN         NaN            NaN         NaN   
14435         NaN            NaN         NaN            NaN         NaN   
15108         NaN            NaN         NaN            NaN         NaN   

      component_id_8  quantity_8  
9258             NaN         NaN  
6255             NaN         NaN  
5653             NaN         NaN  
449              NaN         NaN  
14694            NaN         NaN  
19962            NaN         NaN  
12335            NaN         NaN  
15664            NaN         NaN  
14435            NaN         NaN  
15108            NaN         NaN  

In [42]:
tube_df = pd.read_csv('./competition_data/tube.csv')

In [44]:
train_df_no_bracket_indexed_by_id = train_df_no_bracket.set_index('tube_assembly_id')
bill_materials_df_indexed_by_id = bill_materials_df.set_index('tube_assembly_id')
tube_df_indexed_by_id = tube_df.set_index('tube_assembly_id')
# model_df = train_df_no_bracket_indexed_by_id.join(bill_materials_df_indexed_by_id)
model_df = pd.concat([train_df_no_bracket_indexed_by_id, bill_materials_df_indexed_by_id, tube_df_indexed_by_id], \
          axis=1, join='inner')

In [46]:
print model_df.shape
# print model_df.head()
print model_df.dtypes


(3930, 38)
supplier                      object
quote_date            datetime64[ns]
annual_usage                   int64
min_order_quantity             int64
bracket_pricing               object
quantity                       int64
cost                         float64
component_id_1                object
quantity_1                   float64
component_id_2                object
quantity_2                   float64
component_id_3                object
quantity_3                   float64
component_id_4                object
quantity_4                   float64
component_id_5                object
quantity_5                   float64
component_id_6                object
quantity_6                   float64
component_id_7                object
quantity_7                   float64
component_id_8                object
quantity_8                   float64
material_id                   object
diameter                     float64
wall                         float64
length                       float64
num_bends                      int64
bend_radius                  float64
end_a_1x                      object
end_a_2x                      object
end_x_1x                      object
end_x_2x                      object
end_a                         object
end_x                         object
num_boss                       int64
num_bracket                    int64
other                          int64
dtype: object

In [52]:
model_X = model_df[['annual_usage', 'diameter', 'wall', 'length', 'num_bends']].values
model_y = model_df['cost'].values
# model_X

In [55]:
lin_reg = linear_model.LinearRegression()
lin_reg.fit(model_X, model_y)
lin_reg.score(model_X, model_y)


Out[55]:
0.15001114716437591