In [1]:
%pylab inline
import pandas as pd


Populating the interactive namespace from numpy and matplotlib

In [2]:
test_set = pd.read_csv('competition_data/test_set.csv')

In [3]:
print test_set.shape
print test_set.dtypes
test_set.sample(n=5, random_state=0)


(30235, 8)
id                     int64
tube_assembly_id      object
supplier              object
quote_date            object
annual_usage           int64
min_order_quantity     int64
bracket_pricing       object
quantity               int64
dtype: object
Out[3]:
id tube_assembly_id supplier quote_date annual_usage min_order_quantity bracket_pricing quantity
7898 7899 TA-04496 S-0066 2013-09-01 0 0 Yes 100
3977 3978 TA-02094 S-0041 2012-03-26 1 0 Yes 5
8255 8256 TA-04719 S-0066 2012-05-11 5 0 Yes 2
9304 9305 TA-05313 S-0066 2013-11-02 1 0 Yes 250
20327 20328 TA-14257 S-0066 2012-08-13 78 0 Yes 6

In [4]:
test_set_by_bracket = test_set.groupby(by='bracket_pricing')

In [5]:
test_set_by_bracket.count()


Out[5]:
id tube_assembly_id supplier quote_date annual_usage min_order_quantity quantity
bracket_pricing
No 3956 3956 3956 3956 3956 3956 3956
Yes 26279 26279 26279 26279 26279 26279 26279

In [7]:
test_set_no_date = test_set.drop(['quote_date'], axis=1)

In [11]:
test_set_no_date_dummies = pd.get_dummies(test_set_no_date, columns=['supplier', 'bracket_pricing'])
# test_set_no_date_dummies.dtypes

In [10]:
test_set_no_date_dummies_drop_bracket_no = test_set_no_date_dummies.drop(['bracket_pricing_No'], axis=1)

In [ ]:
# test_set_no_date_dummies_drop_bracket_no.to_csv('')

In [12]:
train_set = pd.read_csv('./competition_data/train_set.csv')

In [14]:
train_ta_ids = train_set.tube_assembly_id.values
test_ta_ids = test_set.tube_assembly_id.values

In [15]:
np.intersect1d(train_ta_ids, test_ta_ids)


Out[15]:
array([], dtype=object)

In [16]:
train_suppliers = train_set.supplier.values
test_suppliers = test_set.supplier.values

In [18]:
np.intersect1d(train_suppliers, test_suppliers)


Out[18]:
array(['S-0004', 'S-0005', 'S-0006', 'S-0007', 'S-0008', 'S-0009',
       'S-0011', 'S-0012', 'S-0013', 'S-0014', 'S-0015', 'S-0018',
       'S-0023', 'S-0026', 'S-0027', 'S-0030', 'S-0031', 'S-0041',
       'S-0042', 'S-0043', 'S-0046', 'S-0050', 'S-0051', 'S-0054',
       'S-0056', 'S-0058', 'S-0059', 'S-0060', 'S-0061', 'S-0062',
       'S-0064', 'S-0066', 'S-0068', 'S-0070', 'S-0072', 'S-0074',
       'S-0078', 'S-0080', 'S-0081', 'S-0087', 'S-0090', 'S-0092',
       'S-0104', 'S-0105', 'S-0108'], dtype=object)

In [21]:
np.unique(test_suppliers)


Out[21]:
array(['S-0004', 'S-0005', 'S-0006', 'S-0007', 'S-0008', 'S-0009',
       'S-0011', 'S-0012', 'S-0013', 'S-0014', 'S-0015', 'S-0018',
       'S-0023', 'S-0026', 'S-0027', 'S-0028', 'S-0030', 'S-0031',
       'S-0033', 'S-0036', 'S-0039', 'S-0040', 'S-0041', 'S-0042',
       'S-0043', 'S-0046', 'S-0050', 'S-0051', 'S-0054', 'S-0056',
       'S-0058', 'S-0059', 'S-0060', 'S-0061', 'S-0062', 'S-0064',
       'S-0066', 'S-0068', 'S-0069', 'S-0070', 'S-0072', 'S-0073',
       'S-0074', 'S-0076', 'S-0077', 'S-0078', 'S-0080', 'S-0081',
       'S-0087', 'S-0088', 'S-0090', 'S-0091', 'S-0092', 'S-0104',
       'S-0105', 'S-0108'], dtype=object)

In [26]:
test_set_only_suppliers = np.setdiff1d(test_suppliers, train_suppliers)
test_set_only_suppliers_columns = ['supplier_' + s for s in test_set_only_suppliers]

In [25]:
test_set_no_date_dummies_drop_bracket_no.columns


Out[25]:
Index([u'id', u'tube_assembly_id', u'annual_usage', u'min_order_quantity',
       u'quantity', u'supplier_S-0004', u'supplier_S-0005', u'supplier_S-0006',
       u'supplier_S-0007', u'supplier_S-0008', u'supplier_S-0009',
       u'supplier_S-0011', u'supplier_S-0012', u'supplier_S-0013',
       u'supplier_S-0014', u'supplier_S-0015', u'supplier_S-0018',
       u'supplier_S-0023', u'supplier_S-0026', u'supplier_S-0027',
       u'supplier_S-0028', u'supplier_S-0030', u'supplier_S-0031',
       u'supplier_S-0033', u'supplier_S-0036', u'supplier_S-0039',
       u'supplier_S-0040', u'supplier_S-0041', u'supplier_S-0042',
       u'supplier_S-0043', u'supplier_S-0046', u'supplier_S-0050',
       u'supplier_S-0051', u'supplier_S-0054', u'supplier_S-0056',
       u'supplier_S-0058', u'supplier_S-0059', u'supplier_S-0060',
       u'supplier_S-0061', u'supplier_S-0062', u'supplier_S-0064',
       u'supplier_S-0066', u'supplier_S-0068', u'supplier_S-0069',
       u'supplier_S-0070', u'supplier_S-0072', u'supplier_S-0073',
       u'supplier_S-0074', u'supplier_S-0076', u'supplier_S-0077',
       u'supplier_S-0078', u'supplier_S-0080', u'supplier_S-0081',
       u'supplier_S-0087', u'supplier_S-0088', u'supplier_S-0090',
       u'supplier_S-0091', u'supplier_S-0092', u'supplier_S-0104',
       u'supplier_S-0105', u'supplier_S-0108', u'bracket_pricing_Yes'],
      dtype='object')

In [27]:
test_set_no_date_dummies_drop_bracket_no_drop_suppliers = \
test_set_no_date_dummies_drop_bracket_no.drop(test_set_only_suppliers_columns, axis=1)

In [28]:
test_set_no_date_dummies_drop_bracket_no_drop_suppliers.head()


Out[28]:
id tube_assembly_id annual_usage min_order_quantity quantity supplier_S-0004 supplier_S-0005 supplier_S-0006 supplier_S-0007 supplier_S-0008 ... supplier_S-0078 supplier_S-0080 supplier_S-0081 supplier_S-0087 supplier_S-0090 supplier_S-0092 supplier_S-0104 supplier_S-0105 supplier_S-0108 bracket_pricing_Yes
0 1 TA-00001 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
1 2 TA-00001 0 0 2 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2 3 TA-00001 0 0 5 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
3 4 TA-00001 0 0 10 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
4 5 TA-00001 0 0 25 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 51 columns


In [29]:
test_set_no_date_dummies_drop_bracket_no_drop_suppliers.to_csv('test_set_no_date_dummies_drop_bracket_no_drop_suppliers.csv', index=False)