notebook.community

Edit and run



In [1]:

    
%pylab inline
import pandas as pd









    



Populating the interactive namespace from numpy and matplotlib



In [49]:

    
test_set = pd.read_csv('competition_data/test_set.csv', parse_dates=['quote_date'])



In [50]:

    
print test_set.shape
print test_set.dtypes
test_set.sample(n=5, random_state=0)









    



(30235, 8)
id                             int64
tube_assembly_id              object
supplier                      object
quote_date            datetime64[ns]
annual_usage                   int64
min_order_quantity             int64
bracket_pricing               object
quantity                       int64
dtype: object






    Out[50]:






  
    
      
      id
      tube_assembly_id
      supplier
      quote_date
      annual_usage
      min_order_quantity
      bracket_pricing
      quantity
    
  
  
    
      7898
      7899
      TA-04496
      S-0066
      2013-09-01
      0
      0
      Yes
      100
    
    
      3977
      3978
      TA-02094
      S-0041
      2012-03-26
      1
      0
      Yes
      5
    
    
      8255
      8256
      TA-04719
      S-0066
      2012-05-11
      5
      0
      Yes
      2
    
    
      9304
      9305
      TA-05313
      S-0066
      2013-11-02
      1
      0
      Yes
      250
    
    
      20327
      20328
      TA-14257
      S-0066
      2012-08-13
      78
      0
      Yes
      6



In [4]:

    
test_set_by_bracket = test_set.groupby(by='bracket_pricing')



In [5]:

    
test_set_by_bracket.count()









    Out[5]:






  
    
      
      id
      tube_assembly_id
      supplier
      quote_date
      annual_usage
      min_order_quantity
      quantity
    
    
      bracket_pricing
      
      
      
      
      
      
      
    
  
  
    
      No
      3956
      3956
      3956
      3956
      3956
      3956
      3956
    
    
      Yes
      26279
      26279
      26279
      26279
      26279
      26279
      26279



In [51]:

    
test_set['year'] = test_set.quote_date.apply(lambda x : x.year)
test_set['quantity_rep'] = test_set.quantity.apply(lambda x : 1.0 / x)



In [7]:

    
# test_set_no_date = test_set.drop(['quote_date'], axis=1)



In [11]:

    
# test_set_no_date_dummies = pd.get_dummies(test_set_no_date, columns=['supplier', 'bracket_pricing'])
# test_set_no_date_dummies.dtypes



In [10]:

    
# test_set_no_date_dummies_drop_bracket_no = test_set_no_date_dummies.drop(['bracket_pricing_No'], axis=1)



In [ ]:

    
# test_set_no_date_dummies_drop_bracket_no.to_csv('')



In [52]:

    
train_set = pd.read_csv('./competition_data/train_set.csv', parse_dates=['quote_date'])



In [53]:

    
train_set['year'] = train_set.quote_date.apply(lambda x : x.year)
train_set['quantity_rep'] = train_set.quantity.apply(lambda x : 1.0 / x)



In [14]:

    
train_ta_ids = train_set.tube_assembly_id.values
test_ta_ids = test_set.tube_assembly_id.values



In [15]:

    
np.intersect1d(train_ta_ids, test_ta_ids)









    Out[15]:





array([], dtype=object)



In [16]:

    
train_suppliers = train_set.supplier.values
test_suppliers = test_set.supplier.values



In [18]:

    
# np.intersect1d(train_suppliers, test_suppliers)









    Out[18]:





array(['S-0004', 'S-0005', 'S-0006', 'S-0007', 'S-0008', 'S-0009',
       'S-0011', 'S-0012', 'S-0013', 'S-0014', 'S-0015', 'S-0018',
       'S-0023', 'S-0026', 'S-0027', 'S-0030', 'S-0031', 'S-0041',
       'S-0042', 'S-0043', 'S-0046', 'S-0050', 'S-0051', 'S-0054',
       'S-0056', 'S-0058', 'S-0059', 'S-0060', 'S-0061', 'S-0062',
       'S-0064', 'S-0066', 'S-0068', 'S-0070', 'S-0072', 'S-0074',
       'S-0078', 'S-0080', 'S-0081', 'S-0087', 'S-0090', 'S-0092',
       'S-0104', 'S-0105', 'S-0108'], dtype=object)



In [21]:

    
# np.unique(test_suppliers)









    Out[21]:





array(['S-0004', 'S-0005', 'S-0006', 'S-0007', 'S-0008', 'S-0009',
       'S-0011', 'S-0012', 'S-0013', 'S-0014', 'S-0015', 'S-0018',
       'S-0023', 'S-0026', 'S-0027', 'S-0028', 'S-0030', 'S-0031',
       'S-0033', 'S-0036', 'S-0039', 'S-0040', 'S-0041', 'S-0042',
       'S-0043', 'S-0046', 'S-0050', 'S-0051', 'S-0054', 'S-0056',
       'S-0058', 'S-0059', 'S-0060', 'S-0061', 'S-0062', 'S-0064',
       'S-0066', 'S-0068', 'S-0069', 'S-0070', 'S-0072', 'S-0073',
       'S-0074', 'S-0076', 'S-0077', 'S-0078', 'S-0080', 'S-0081',
       'S-0087', 'S-0088', 'S-0090', 'S-0091', 'S-0092', 'S-0104',
       'S-0105', 'S-0108'], dtype=object)



In [30]:

    
test_set_only_suppliers = np.setdiff1d(train_suppliers, test_suppliers)









    Out[30]:





array(['S-0003', 'S-0022', 'S-0024', 'S-0025', 'S-0029', 'S-0095',
       'S-0096', 'S-0097', 'S-0106', 'S-0107', 'S-0109', 'S-0111'], dtype=object)



In [26]:

    
test_set_only_suppliers = np.setdiff1d(test_suppliers, train_suppliers)
test_set_only_suppliers_columns = ['supplier_' + s for s in test_set_only_suppliers]



In [25]:

    
# test_set_no_date_dummies_drop_bracket_no.columns









    Out[25]:





Index([u'id', u'tube_assembly_id', u'annual_usage', u'min_order_quantity',
       u'quantity', u'supplier_S-0004', u'supplier_S-0005', u'supplier_S-0006',
       u'supplier_S-0007', u'supplier_S-0008', u'supplier_S-0009',
       u'supplier_S-0011', u'supplier_S-0012', u'supplier_S-0013',
       u'supplier_S-0014', u'supplier_S-0015', u'supplier_S-0018',
       u'supplier_S-0023', u'supplier_S-0026', u'supplier_S-0027',
       u'supplier_S-0028', u'supplier_S-0030', u'supplier_S-0031',
       u'supplier_S-0033', u'supplier_S-0036', u'supplier_S-0039',
       u'supplier_S-0040', u'supplier_S-0041', u'supplier_S-0042',
       u'supplier_S-0043', u'supplier_S-0046', u'supplier_S-0050',
       u'supplier_S-0051', u'supplier_S-0054', u'supplier_S-0056',
       u'supplier_S-0058', u'supplier_S-0059', u'supplier_S-0060',
       u'supplier_S-0061', u'supplier_S-0062', u'supplier_S-0064',
       u'supplier_S-0066', u'supplier_S-0068', u'supplier_S-0069',
       u'supplier_S-0070', u'supplier_S-0072', u'supplier_S-0073',
       u'supplier_S-0074', u'supplier_S-0076', u'supplier_S-0077',
       u'supplier_S-0078', u'supplier_S-0080', u'supplier_S-0081',
       u'supplier_S-0087', u'supplier_S-0088', u'supplier_S-0090',
       u'supplier_S-0091', u'supplier_S-0092', u'supplier_S-0104',
       u'supplier_S-0105', u'supplier_S-0108', u'bracket_pricing_Yes'],
      dtype='object')



In [27]:

    
# test_set_no_date_dummies_drop_bracket_no_drop_suppliers = \
# test_set_no_date_dummies_drop_bracket_no.drop(test_set_only_suppliers_columns, axis=1)



In [54]:

    
# test_set_no_date_dummies_drop_bracket_no_drop_suppliers.head()



In [29]:

    
# test_set_no_date_dummies_drop_bracket_no_drop_suppliers.to_csv('test_set_no_date_dummies_drop_bracket_no_drop_suppliers.csv', index=False)



In [55]:

    
test_in_train_format = test_set.drop(['id'], axis=1)
test_in_train_format['cost'] = 0
print test_in_train_format.shape









    



(30235, 10)



In [56]:

    
train_test_stack = pd.concat([train_set, test_in_train_format])



In [57]:

    
print train_test_stack.shape
print train_test_stack.dtypes









    



(60448, 10)
annual_usage                   int64
bracket_pricing               object
cost                         float64
min_order_quantity             int64
quantity                       int64
quantity_rep                 float64
quote_date            datetime64[ns]
supplier                      object
tube_assembly_id              object
year                           int64
dtype: object



In [58]:

    
train_test_stack_dummies = pd.get_dummies(train_test_stack, columns=['supplier', 'bracket_pricing'])



In [59]:

    
test_dummies = (train_test_stack_dummies[train_test_stack_dummies.cost == 0]).drop(test_set_only_suppliers_columns + ['bracket_pricing_No'], axis=1)
test_dummies['id'] = test_set['id']



In [60]:

    
test_dummies.head()









    Out[60]:






  
    
      
      annual_usage
      cost
      min_order_quantity
      quantity
      quantity_rep
      quote_date
      tube_assembly_id
      year
      supplier_S-0003
      supplier_S-0004
      ...
      supplier_S-0097
      supplier_S-0104
      supplier_S-0105
      supplier_S-0106
      supplier_S-0107
      supplier_S-0108
      supplier_S-0109
      supplier_S-0111
      bracket_pricing_Yes
      id
    
  
  
    
      0
      0
      0
      0
      1
      1.00
      2013-06-23
      TA-00001
      2013
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      1
      1
    
    
      1
      0
      0
      0
      2
      0.50
      2013-06-23
      TA-00001
      2013
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      1
      2
    
    
      2
      0
      0
      0
      5
      0.20
      2013-06-23
      TA-00001
      2013
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      1
      3
    
    
      3
      0
      0
      0
      10
      0.10
      2013-06-23
      TA-00001
      2013
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      1
      4
    
    
      4
      0
      0
      0
      25
      0.04
      2013-06-23
      TA-00001
      2013
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      1
      5
    
  

5 rows × 67 columns



In [61]:

    
test_dummies.to_csv('test_dummies_adjusted.csv', index=False)

	id	tube_assembly_id	supplier	quote_date	annual_usage	bracket_pricing	quantity
7898	7899	TA-04496	S-0066	2013-09-01	0	Yes	100
3977	3978	TA-02094	S-0041	2012-03-26	1	Yes	5
8255	8256	TA-04719	S-0066	2012-05-11	5	Yes	2
9304	9305	TA-05313	S-0066	2013-11-02	1	Yes	250
20327	20328	TA-14257	S-0066	2012-08-13	78	Yes	6

	id	tube_assembly_id	supplier	quote_date	annual_usage	min_order_quantity	quantity
bracket_pricing
No	3956	3956	3956	3956	3956	3956	3956
Yes	26279	26279	26279	26279	26279	26279	26279

	quantity	quantity_rep	quote_date	tube_assembly_id	year	...	bracket_pricing_Yes	id
0	1	1.00	2013-06-23	TA-00001	2013	...	1	1
1	2	0.50	2013-06-23	TA-00001	2013	...	1	2
2	5	0.20	2013-06-23	TA-00001	2013	...	1	3
3	10	0.10	2013-06-23	TA-00001	2013	...	1	4
4	25	0.04	2013-06-23	TA-00001	2013	...	1	5