In [7]:
%pylab inline
import pandas as pd


Populating the interactive namespace from numpy and matplotlib

Read all files related to components.


In [8]:
components = pd.read_csv('./competition_data/components.csv')
comp_adaptor = pd.read_csv('./competition_data/comp_adaptor.csv')
comp_boss = pd.read_csv('./competition_data/comp_boss.csv')
comp_elbow = pd.read_csv('./competition_data/comp_elbow.csv')
comp_float = pd.read_csv('./competition_data/comp_float.csv')
comp_hfl = pd.read_csv('./competition_data/comp_hfl.csv')
comp_nut = pd.read_csv('./competition_data/comp_nut.csv')
comp_other = pd.read_csv('./competition_data/comp_other.csv')
comp_sleeve = pd.read_csv('./competition_data/comp_sleeve.csv')
comp_straight = pd.read_csv('./competition_data/comp_straight.csv')
comp_tee = pd.read_csv('./competition_data/comp_tee.csv')
comp_threaded = pd.read_csv('./competition_data/comp_threaded.csv')
comp_id_dict = components.set_index('component_id')['component_type_id'].to_dict()

In [9]:
print 'components:'
print sorted(components['component_type_id'].unique())
print 'adaptor:'
print comp_adaptor['component_type_id'].unique()
print 'boss:'
print comp_boss['component_type_id'].unique()
print 'elbow:'
print comp_elbow['component_type_id'].unique()
print 'float:'
print comp_float['component_type_id'].unique()
print 'hfl:'
print comp_hfl['component_type_id'].unique()
print 'nut:'
print comp_nut['component_type_id'].unique()
# comp_other does not contain component_type_id column
print 'sleeve:'
print comp_sleeve['component_type_id'].unique()
print 'straight:'
print comp_straight['component_type_id'].unique()
print 'tee:'
print comp_tee['component_type_id'].unique()
print 'threaded:'
print comp_threaded['component_type_id'].unique()

print comp_adaptor[comp_adaptor['component_id']=='9999']


components:
['CP-001', 'CP-002', 'CP-003', 'CP-004', 'CP-005', 'CP-006', 'CP-007', 'CP-008', 'CP-009', 'CP-010', 'CP-011', 'CP-012', 'CP-014', 'CP-015', 'CP-016', 'CP-017', 'CP-018', 'CP-019', 'CP-020', 'CP-021', 'CP-022', 'CP-023', 'CP-024', 'CP-025', 'CP-026', 'CP-027', 'CP-028', 'CP-029', 'OTHER']
adaptor:
['CP-028' 'CP-029']
boss:
['CP-018' 'CP-019' 'CP-020']
elbow:
['CP-008' 'CP-009' 'CP-010' 'CP-011' 'CP-012']
float:
['CP-021' 'CP-022']
hfl:
['CP-023']
nut:
['CP-025' 'CP-026' 'CP-027']
sleeve:
['CP-024']
straight:
['CP-001' 'CP-002' 'CP-003' 'CP-004' 'CP-005' 'CP-006' 'CP-007']
tee:
['OTHER']
threaded:
['CP-014' 'CP-015' 'CP-016' 'CP-017']
Empty DataFrame
Columns: [component_id, component_type_id, adaptor_angle, overall_length, end_form_id_1, connection_type_id_1, length_1, thread_size_1, thread_pitch_1, nominal_size_1, end_form_id_2, connection_type_id_2, length_2, thread_size_2, thread_pitch_2, nominal_size_2, hex_size, unique_feature, orientation, weight]
Index: []

In [10]:
comp_adaptor


Out[10]:
component_id component_type_id adaptor_angle overall_length end_form_id_1 connection_type_id_1 length_1 thread_size_1 thread_pitch_1 nominal_size_1 end_form_id_2 connection_type_id_2 length_2 thread_size_2 thread_pitch_2 nominal_size_2 hex_size unique_feature orientation weight
0 C-0005 CP-028 NaN 58.40 A-001 B-001 NaN 1.312 12.0 NaN A-001 B-004 NaN 1.000 11.5 NaN 34.93 No No 0.206
1 C-0006 CP-028 NaN 34.80 A-001 B-001 NaN 0.437 20.0 NaN A-001 B-005 NaN 0.750 16.0 NaN 22.20 No No 0.083
2 C-1435 CP-028 NaN 20.30 A-007 B-004 NaN NaN NaN 15.88 A-001 B-007 NaN 0.875 18.0 NaN 22.22 No No 0.023
3 C-1546 CP-028 NaN 26.40 A-007 B-004 NaN 0.125 27.0 NaN A-001 B-004 NaN 0.125 27.0 NaN 15.88 No No 0.026
4 C-1583 CP-028 NaN 44.50 A-001 B-005 NaN 1.312 12.0 NaN A-007 B-005 NaN 1.062 12.0 NaN 38.10 No No 0.256
5 C-1634 CP-028 NaN 34.50 A-001 B-005 NaN 0.750 16.0 NaN A-001 B-002 NaN 0.687 16.0 NaN 22.23 No No 0.060
6 C-1975 CP-028 NaN 13.20 A-007 B-007 NaN NaN NaN 3.18 A-001 B-007 NaN 0.312 28.0 NaN NaN No No 0.005
7 C-0428 CP-028 NaN 26.99 A-001 B-004 NaN 0.250 18.0 NaN A-007 NaN NaN NaN NaN 9.52 17.46 No No 0.032
8 C-0443 CP-028 NaN 22.35 A-007 B-007 NaN NaN NaN 19.05 9999 9999 NaN 1.062 16.0 NaN 26.97 No No NaN
9 C-0823 CP-028 NaN 16.80 A-007 B-007 NaN NaN NaN 9.52 A-001 9999 NaN 0.625 18.0 9.52 15.75 No No 0.014
10 C-0354 CP-028 NaN 36.30 A-001 B-004 NaN 1.250 11.5 NaN A-007 B-004 NaN 1.000 11.5 NaN 44.45 No No 0.136
11 C-0361 CP-028 NaN 41.00 A-005 NaN NaN NaN NaN 6.35 A-006 B-002 NaN 0.562 18.0 NaN 15.88 No No 0.005
12 C-1312 CP-028 NaN 14.20 A-002 B-007 NaN 0.437 24.0 NaN 9999 9999 NaN NaN NaN NaN 11.11 No No 0.009
13 C-1313 CP-028 NaN 15.70 A-007 B-007 NaN NaN NaN 7.94 A-001 B-007 NaN 0.500 20.0 NaN 12.57 No No 0.010
14 C-1194 CP-028 NaN 15.00 A-007 B-004 NaN 0.125 27.0 NaN A-007 B-004 NaN 0.125 27.0 NaN NaN No No 0.014
15 C-1195 CP-028 NaN 30.00 A-007 B-004 NaN 0.250 18.0 NaN A-007 B-004 NaN 0.250 18.0 NaN NaN Yes No 0.031
16 C-1196 CP-028 NaN 40.00 A-007 B-004 NaN 0.500 14.0 NaN A-007 B-004 NaN 0.500 14.0 NaN NaN No No 0.095
17 C-1197 CP-028 NaN 41.00 A-007 B-004 NaN 0.750 14.0 NaN A-007 B-004 NaN 0.750 14.0 NaN NaN No No 0.091
18 C-1198 CP-028 NaN 51.00 A-007 B-004 NaN 1.000 11.5 NaN A-007 B-004 NaN 1.000 11.5 NaN NaN No No 0.181
19 C-1229 CP-028 NaN 18.80 A-007 B-004 NaN NaN NaN 12.70 A-001 B-007 NaN 0.750 18.0 NaN 18.92 No No 0.009
20 C-1230 CP-028 NaN 14.20 A-007 B-007 NaN NaN NaN 4.76 A-001 B-005 NaN 0.375 24.0 NaN 9.40 No No 0.009
21 C-1695 CP-028 NaN 30.00 A-001 B-006 NaN 0.750 16.0 NaN A-007 9999 NaN 9999.000 9999.0 NaN 25.40 No No NaN
22 C-1812 CP-028 NaN 35.10 A-007 B-004 NaN 0.375 18.0 NaN A-007 B-004 NaN 0.375 18.0 NaN NaN No No 0.091
23 C-1828 CP-028 NaN 28.40 A-001 B-005 NaN 1.312 12.0 NaN A-007 B-005 NaN 0.562 18.0 NaN 38.10 No No 0.181
24 C-1868 CP-029 90 NaN A-001 B-005 65.5 0.750 16.0 NaN A-001 B-002 28 0.812 16.0 NaN NaN No Yes 0.226

In [11]:
comp_adaptor[comp_adaptor.component_type_id == 'CP-028'].weight.hist()


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x107960cd0>

In [12]:
bill_of_materials = pd.read_csv('./competition_data/bill_of_materials.csv')
bill_of_materials.fillna(0,inplace=True)

In [13]:
print bill_of_materials.columns
print bill_of_materials.ix[1,]


Index([u'tube_assembly_id', u'component_id_1', u'quantity_1',
       u'component_id_2', u'quantity_2', u'component_id_3', u'quantity_3',
       u'component_id_4', u'quantity_4', u'component_id_5', u'quantity_5',
       u'component_id_6', u'quantity_6', u'component_id_7', u'quantity_7',
       u'component_id_8', u'quantity_8'],
      dtype='object')
tube_assembly_id    TA-00002
component_id_1        C-1312
quantity_1                 2
component_id_2             0
quantity_2                 0
component_id_3             0
quantity_3                 0
component_id_4             0
quantity_4                 0
component_id_5             0
quantity_5                 0
component_id_6             0
quantity_6                 0
component_id_7             0
quantity_7                 0
component_id_8             0
quantity_8                 0
Name: 1, dtype: object

In [14]:
comp_list = [comp_adaptor,comp_boss,comp_elbow,comp_float,comp_hfl,comp_nut,comp_other,\
            comp_sleeve,comp_straight,comp_tee,comp_threaded]
comp_weight = pd.concat([pd.DataFrame(c,columns=['component_id','weight']) for c in comp_list])
comp_id_weight = comp_weight.set_index('component_id')['weight'].to_dict()
print comp_other[components['component_id']=='9999']


  component_id   part_name  weight
0       C-1385  NUT-FLARED   0.014
/Users/benqingshen/anaconda/lib/python2.7/site-packages/pandas/core/frame.py:1825: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)

In [15]:
tube_assembly_comp = pd.DataFrame()
tube_assembly_comp['tube_assembly_id'] = bill_of_materials['tube_assembly_id']
tube_assembly_comp['comp_quant'] = bill_of_materials['quantity_1']+bill_of_materials['quantity_2']+\
bill_of_materials['quantity_3']+bill_of_materials['quantity_4']+bill_of_materials['quantity_5']+\
bill_of_materials['quantity_6']+bill_of_materials['quantity_7']+bill_of_materials['quantity_8']
comp_id_weight[0] = 0
# cannot find component_id == '9999' in comp table
# but '9999' can be found in component table
comp_id_weight['9999'] = 0
def f(s):
    weight = []
    for i in range(1,9):
        weight.append(float(comp_id_weight[s['component_id_'+str(i)]]) * s['quantity_'+str(i)])
    return sum(weight)
tube_assembly_comp['weight'] = bill_of_materials.apply(f,axis=1)

In [22]:
# tube_assembly_comp.plot()
# plt.show()
tube_assembly_comp.shape


Out[22]:
(21198, 3)

In [25]:
train_df = pd.read_csv('./competition_data/train_set.csv', parse_dates=['quote_date'])
# train_df.shape
train_df_weight_quant = train_df.set_index('tube_assembly_id').join(tube_assembly_comp.set_index('tube_assembly_id'))
train_df_weight_quant.sample(n=25)


Out[25]:
supplier quote_date annual_usage min_order_quantity bracket_pricing quantity cost comp_quant weight
tube_assembly_id
TA-04022 S-0062 2005-05-09 19 1 Yes 1 12.878151 2 0.538
TA-04609 S-0066 2013-08-01 0 0 Yes 50 2.823790 4 0.042
TA-05816 S-0066 2013-06-01 0 0 Yes 10 6.980186 4 0.096
TA-06295 S-0066 2013-08-04 0 0 Yes 2 16.978906 3 0.051
TA-02417 S-0066 2013-07-01 0 0 Yes 5 8.707845 4 0.042
TA-15827 S-0041 2012-11-20 1 0 Yes 20 2.498289 4 0.122
TA-10946 S-0066 2013-08-11 0 0 Yes 50 2.693033 4 0.028
TA-07354 S-0072 2012-10-25 43 0 Yes 60 4.153614 4 0.048
TA-19403 S-0066 2013-09-01 0 0 Yes 2 11.350801 2 0.020
TA-11626 S-0066 2013-10-01 0 0 Yes 2 9.701040 0 0.000
TA-03176 S-0072 2007-02-01 10 1 No 1 12.714010 4 0.110
TA-20935 S-0066 2013-08-11 0 0 Yes 250 2.350840 2 0.018
TA-00173 S-0066 2013-12-02 1 0 Yes 10 6.345877 3 0.114
TA-03130 S-0066 2013-09-01 0 0 Yes 10 5.931350 4 0.124
TA-01639 S-0072 1996-01-23 0 1 No 1 4.659949 4 0.028
TA-07983 S-0066 2013-11-02 1 0 Yes 5 6.373697 4 0.042
TA-02025 S-0041 2012-06-14 20 0 Yes 25 3.978344 4 0.041
TA-04170 S-0066 2013-07-01 0 0 Yes 2 11.801495 4 0.042
TA-00764 S-0066 2013-09-01 0 0 Yes 1 21.777958 4 0.124
TA-08080 S-0066 2013-10-01 0 0 Yes 5 8.348959 4 0.324
TA-19224 S-0066 2013-08-01 0 0 Yes 1 20.528814 2 0.028
TA-11386 S-0026 2013-07-18 700 1 No 1 4.156396 2 0.100
TA-15170 S-0041 2012-06-01 1 0 Yes 1 31.559678 2 0.048
TA-01899 S-0026 2012-03-15 12 0 Yes 5 45.909539 1 0.046
TA-00847 S-0066 2013-10-01 0 0 Yes 50 3.616677 4 0.324

In [34]:
x_weight = train_df_weight_quant[train_df_weight_quant['quantity']==1]['weight']
y = train_df_weight_quant[train_df_weight_quant['quantity']==1]['cost']
plt.subplot(1,2,1)
plt.scatter(x_weight,y)
plt.title('cost vs. weight')
x_quant = train_df_weight_quant[train_df_weight_quant['quantity']==1]['comp_quant']
plt.subplot(1,2,2)
plt.scatter(x_quant,y)
plt.title('cost vs. comp. quantity')
plt.subplots_adjust()
plt.show()



In [19]:
import statsmodels.formula.api as sm
result = sm.ols(formula="cost ~ weight + quantity", data=train_df_weight_quant).fit()
print result.summary()


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   cost   R-squared:                       0.199
Model:                            OLS   Adj. R-squared:                  0.199
Method:                 Least Squares   F-statistic:                     3730.
Date:                Thu, 09 Jul 2015   Prob (F-statistic):               0.00
Time:                        21:48:08   Log-Likelihood:            -1.4042e+05
No. Observations:               30109   AIC:                         2.809e+05
Df Residuals:                   30106   BIC:                         2.809e+05
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     12.0728      0.175     69.033      0.000        11.730    12.416
weight        18.8433      0.234     80.508      0.000        18.385    19.302
quantity      -0.0572      0.002    -27.369      0.000        -0.061    -0.053
==============================================================================
Omnibus:                    54066.536   Durbin-Watson:                   1.281
Prob(Omnibus):                  0.000   Jarque-Bera (JB):        114022207.714
Skew:                          12.928   Prob(JB):                         0.00
Kurtosis:                     303.365   Cond. No.                         133.
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

In [35]:
result.resid.plot()


Out[35]:
<matplotlib.axes._subplots.AxesSubplot at 0x10ca76fd0>