In [1]:
%pylab inline
import pandas as pd


Populating the interactive namespace from numpy and matplotlib

In [49]:
test_set = pd.read_csv('competition_data/test_set.csv', parse_dates=['quote_date'])

In [50]:
print test_set.shape
print test_set.dtypes
test_set.sample(n=5, random_state=0)


(30235, 8)
id                             int64
tube_assembly_id              object
supplier                      object
quote_date            datetime64[ns]
annual_usage                   int64
min_order_quantity             int64
bracket_pricing               object
quantity                       int64
dtype: object
Out[50]:
id tube_assembly_id supplier quote_date annual_usage min_order_quantity bracket_pricing quantity
7898 7899 TA-04496 S-0066 2013-09-01 0 0 Yes 100
3977 3978 TA-02094 S-0041 2012-03-26 1 0 Yes 5
8255 8256 TA-04719 S-0066 2012-05-11 5 0 Yes 2
9304 9305 TA-05313 S-0066 2013-11-02 1 0 Yes 250
20327 20328 TA-14257 S-0066 2012-08-13 78 0 Yes 6

In [4]:
test_set_by_bracket = test_set.groupby(by='bracket_pricing')

In [5]:
test_set_by_bracket.count()


Out[5]:
id tube_assembly_id supplier quote_date annual_usage min_order_quantity quantity
bracket_pricing
No 3956 3956 3956 3956 3956 3956 3956
Yes 26279 26279 26279 26279 26279 26279 26279

In [51]:
test_set['year'] = test_set.quote_date.apply(lambda x : x.year)
test_set['quantity_rep'] = test_set.quantity.apply(lambda x : 1.0 / x)

In [7]:
# test_set_no_date = test_set.drop(['quote_date'], axis=1)

In [11]:
# test_set_no_date_dummies = pd.get_dummies(test_set_no_date, columns=['supplier', 'bracket_pricing'])
# test_set_no_date_dummies.dtypes

In [10]:
# test_set_no_date_dummies_drop_bracket_no = test_set_no_date_dummies.drop(['bracket_pricing_No'], axis=1)

In [ ]:
# test_set_no_date_dummies_drop_bracket_no.to_csv('')

In [52]:
train_set = pd.read_csv('./competition_data/train_set.csv', parse_dates=['quote_date'])

In [53]:
train_set['year'] = train_set.quote_date.apply(lambda x : x.year)
train_set['quantity_rep'] = train_set.quantity.apply(lambda x : 1.0 / x)

In [14]:
train_ta_ids = train_set.tube_assembly_id.values
test_ta_ids = test_set.tube_assembly_id.values

In [15]:
np.intersect1d(train_ta_ids, test_ta_ids)


Out[15]:
array([], dtype=object)

In [16]:
train_suppliers = train_set.supplier.values
test_suppliers = test_set.supplier.values

In [18]:
# np.intersect1d(train_suppliers, test_suppliers)


Out[18]:
array(['S-0004', 'S-0005', 'S-0006', 'S-0007', 'S-0008', 'S-0009',
       'S-0011', 'S-0012', 'S-0013', 'S-0014', 'S-0015', 'S-0018',
       'S-0023', 'S-0026', 'S-0027', 'S-0030', 'S-0031', 'S-0041',
       'S-0042', 'S-0043', 'S-0046', 'S-0050', 'S-0051', 'S-0054',
       'S-0056', 'S-0058', 'S-0059', 'S-0060', 'S-0061', 'S-0062',
       'S-0064', 'S-0066', 'S-0068', 'S-0070', 'S-0072', 'S-0074',
       'S-0078', 'S-0080', 'S-0081', 'S-0087', 'S-0090', 'S-0092',
       'S-0104', 'S-0105', 'S-0108'], dtype=object)

In [21]:
# np.unique(test_suppliers)


Out[21]:
array(['S-0004', 'S-0005', 'S-0006', 'S-0007', 'S-0008', 'S-0009',
       'S-0011', 'S-0012', 'S-0013', 'S-0014', 'S-0015', 'S-0018',
       'S-0023', 'S-0026', 'S-0027', 'S-0028', 'S-0030', 'S-0031',
       'S-0033', 'S-0036', 'S-0039', 'S-0040', 'S-0041', 'S-0042',
       'S-0043', 'S-0046', 'S-0050', 'S-0051', 'S-0054', 'S-0056',
       'S-0058', 'S-0059', 'S-0060', 'S-0061', 'S-0062', 'S-0064',
       'S-0066', 'S-0068', 'S-0069', 'S-0070', 'S-0072', 'S-0073',
       'S-0074', 'S-0076', 'S-0077', 'S-0078', 'S-0080', 'S-0081',
       'S-0087', 'S-0088', 'S-0090', 'S-0091', 'S-0092', 'S-0104',
       'S-0105', 'S-0108'], dtype=object)

In [30]:
test_set_only_suppliers = np.setdiff1d(train_suppliers, test_suppliers)


Out[30]:
array(['S-0003', 'S-0022', 'S-0024', 'S-0025', 'S-0029', 'S-0095',
       'S-0096', 'S-0097', 'S-0106', 'S-0107', 'S-0109', 'S-0111'], dtype=object)

In [26]:
test_set_only_suppliers = np.setdiff1d(test_suppliers, train_suppliers)
test_set_only_suppliers_columns = ['supplier_' + s for s in test_set_only_suppliers]

In [25]:
# test_set_no_date_dummies_drop_bracket_no.columns


Out[25]:
Index([u'id', u'tube_assembly_id', u'annual_usage', u'min_order_quantity',
       u'quantity', u'supplier_S-0004', u'supplier_S-0005', u'supplier_S-0006',
       u'supplier_S-0007', u'supplier_S-0008', u'supplier_S-0009',
       u'supplier_S-0011', u'supplier_S-0012', u'supplier_S-0013',
       u'supplier_S-0014', u'supplier_S-0015', u'supplier_S-0018',
       u'supplier_S-0023', u'supplier_S-0026', u'supplier_S-0027',
       u'supplier_S-0028', u'supplier_S-0030', u'supplier_S-0031',
       u'supplier_S-0033', u'supplier_S-0036', u'supplier_S-0039',
       u'supplier_S-0040', u'supplier_S-0041', u'supplier_S-0042',
       u'supplier_S-0043', u'supplier_S-0046', u'supplier_S-0050',
       u'supplier_S-0051', u'supplier_S-0054', u'supplier_S-0056',
       u'supplier_S-0058', u'supplier_S-0059', u'supplier_S-0060',
       u'supplier_S-0061', u'supplier_S-0062', u'supplier_S-0064',
       u'supplier_S-0066', u'supplier_S-0068', u'supplier_S-0069',
       u'supplier_S-0070', u'supplier_S-0072', u'supplier_S-0073',
       u'supplier_S-0074', u'supplier_S-0076', u'supplier_S-0077',
       u'supplier_S-0078', u'supplier_S-0080', u'supplier_S-0081',
       u'supplier_S-0087', u'supplier_S-0088', u'supplier_S-0090',
       u'supplier_S-0091', u'supplier_S-0092', u'supplier_S-0104',
       u'supplier_S-0105', u'supplier_S-0108', u'bracket_pricing_Yes'],
      dtype='object')

In [27]:
# test_set_no_date_dummies_drop_bracket_no_drop_suppliers = \
# test_set_no_date_dummies_drop_bracket_no.drop(test_set_only_suppliers_columns, axis=1)

In [54]:
# test_set_no_date_dummies_drop_bracket_no_drop_suppliers.head()

In [29]:
# test_set_no_date_dummies_drop_bracket_no_drop_suppliers.to_csv('test_set_no_date_dummies_drop_bracket_no_drop_suppliers.csv', index=False)

In [55]:
test_in_train_format = test_set.drop(['id'], axis=1)
test_in_train_format['cost'] = 0
print test_in_train_format.shape


(30235, 10)

In [56]:
train_test_stack = pd.concat([train_set, test_in_train_format])

In [57]:
print train_test_stack.shape
print train_test_stack.dtypes


(60448, 10)
annual_usage                   int64
bracket_pricing               object
cost                         float64
min_order_quantity             int64
quantity                       int64
quantity_rep                 float64
quote_date            datetime64[ns]
supplier                      object
tube_assembly_id              object
year                           int64
dtype: object

In [58]:
train_test_stack_dummies = pd.get_dummies(train_test_stack, columns=['supplier', 'bracket_pricing'])

In [59]:
test_dummies = (train_test_stack_dummies[train_test_stack_dummies.cost == 0]).drop(test_set_only_suppliers_columns + ['bracket_pricing_No'], axis=1)
test_dummies['id'] = test_set['id']

In [60]:
test_dummies.head()


Out[60]:
annual_usage cost min_order_quantity quantity quantity_rep quote_date tube_assembly_id year supplier_S-0003 supplier_S-0004 ... supplier_S-0097 supplier_S-0104 supplier_S-0105 supplier_S-0106 supplier_S-0107 supplier_S-0108 supplier_S-0109 supplier_S-0111 bracket_pricing_Yes id
0 0 0 0 1 1.00 2013-06-23 TA-00001 2013 0 0 ... 0 0 0 0 0 0 0 0 1 1
1 0 0 0 2 0.50 2013-06-23 TA-00001 2013 0 0 ... 0 0 0 0 0 0 0 0 1 2
2 0 0 0 5 0.20 2013-06-23 TA-00001 2013 0 0 ... 0 0 0 0 0 0 0 0 1 3
3 0 0 0 10 0.10 2013-06-23 TA-00001 2013 0 0 ... 0 0 0 0 0 0 0 0 1 4
4 0 0 0 25 0.04 2013-06-23 TA-00001 2013 0 0 ... 0 0 0 0 0 0 0 0 1 5

5 rows × 67 columns


In [61]:
test_dummies.to_csv('test_dummies_adjusted.csv', index=False)