In [1]:
%pylab inline
import pandas as pd
In [49]:
# Load the test quotes; quote_date is converted to datetimes while reading.
test_set = pd.read_csv(
    'competition_data/test_set.csv',
    parse_dates=['quote_date'],
)
In [50]:
# Quick look at the test data: dimensions, column dtypes and a fixed
# (random_state=0) sample of five rows, which is the cell's displayed output.
# print is written in call form with a single argument so the cell runs
# unchanged on both Python 2 and Python 3.
print(test_set.shape)
print(test_set.dtypes)
test_set.sample(n=5, random_state=0)
Out[50]:
In [4]:
# Group the test quotes by their bracket_pricing value.
test_set_by_bracket = test_set.groupby('bracket_pricing')
In [5]:
# Non-null value count of every column within each bracket_pricing group.
test_set_by_bracket.count()
Out[5]:
In [51]:
# Derived columns for the test quotes, computed as vectorised column
# operations instead of a Python lambda applied row by row:
#   year         - calendar year of quote_date (a datetime column, parsed on load)
#   quantity_rep - reciprocal of the quoted quantity
# NOTE: a quantity of 0 yields inf here rather than raising ZeroDivisionError.
test_set['year'] = test_set['quote_date'].dt.year
test_set['quantity_rep'] = 1.0 / test_set['quantity']
In [7]:
# test_set_no_date = test_set.drop(['quote_date'], axis=1)
In [11]:
# test_set_no_date_dummies = pd.get_dummies(test_set_no_date, columns=['supplier', 'bracket_pricing'])
# test_set_no_date_dummies.dtypes
In [10]:
# test_set_no_date_dummies_drop_bracket_no = test_set_no_date_dummies.drop(['bracket_pricing_No'], axis=1)
In [ ]:
# test_set_no_date_dummies_drop_bracket_no.to_csv('')
In [52]:
# Load the training quotes; quote_date is converted to datetimes while reading.
train_set = pd.read_csv(
    './competition_data/train_set.csv',
    parse_dates=['quote_date'],
)
In [53]:
# Same derived columns as for the test quotes, computed as vectorised column
# operations instead of a Python lambda applied row by row:
#   year         - calendar year of quote_date (a datetime column, parsed on load)
#   quantity_rep - reciprocal of the quoted quantity
# NOTE: a quantity of 0 yields inf here rather than raising ZeroDivisionError.
train_set['year'] = train_set['quote_date'].dt.year
train_set['quantity_rep'] = 1.0 / train_set['quantity']
In [14]:
# Raw arrays of tube assembly ids, one entry per quote row, for set comparisons.
train_ta_ids = train_set['tube_assembly_id'].values
test_ta_ids = test_set['tube_assembly_id'].values
In [15]:
# Tube assembly ids that occur in both the train and the test set
# (intersect1d returns them sorted and de-duplicated).
np.intersect1d(train_ta_ids, test_ta_ids)
Out[15]:
In [16]:
# Raw arrays of supplier codes, one entry per quote row, for set comparisons.
train_suppliers = train_set['supplier'].values
test_suppliers = test_set['supplier'].values
In [18]:
# np.intersect1d(train_suppliers, test_suppliers)
Out[18]:
In [21]:
# np.unique(test_suppliers)
Out[21]:
In [30]:
# Suppliers that appear in the training data but never in the test data
# (setdiff1d keeps the values of its first argument that are missing from
# the second). Kept under its own name so it cannot be mistaken for, or
# clobber, `test_set_only_suppliers`, which holds the opposite difference.
train_set_only_suppliers = np.setdiff1d(train_suppliers, test_suppliers)
Out[30]:
In [26]:
# Suppliers present in the test data but absent from the training data,
# plus the names their one-hot columns get from get_dummies ('supplier_<code>').
test_set_only_suppliers = np.setdiff1d(test_suppliers, train_suppliers)
test_set_only_suppliers_columns = [
    'supplier_' + supplier_code
    for supplier_code in test_set_only_suppliers
]
In [25]:
# test_set_no_date_dummies_drop_bracket_no.columns
Out[25]:
In [27]:
# test_set_no_date_dummies_drop_bracket_no_drop_suppliers = \
# test_set_no_date_dummies_drop_bracket_no.drop(test_set_only_suppliers_columns, axis=1)
In [54]:
# test_set_no_date_dummies_drop_bracket_no_drop_suppliers.head()
In [29]:
# test_set_no_date_dummies_drop_bracket_no_drop_suppliers.to_csv('test_set_no_date_dummies_drop_bracket_no_drop_suppliers.csv', index=False)
In [55]:
# Reshape the test rows so they can be stacked with train_set: drop the
# test-only `id` column and add a `cost` column filled with a placeholder 0.
# Built in one chained expression so the frame is never mutated after creation.
test_in_train_format = test_set.drop(['id'], axis=1).assign(cost=0)
# print in call form with a single argument runs on both Python 2 and Python 3.
print(test_in_train_format.shape)
In [56]:
# Stack the training rows on top of the reformatted test rows.
train_test_stack = pd.concat(
    objs=[train_set, test_in_train_format],
)
In [57]:
# Sanity check of the stacked frame: dimensions and column dtypes.
# print in call form with a single argument runs on both Python 2 and Python 3.
print(train_test_stack.shape)
print(train_test_stack.dtypes)
In [58]:
# One-hot encode supplier and bracket_pricing on the stacked frame, so the
# train and test rows end up with the same set of dummy columns.
train_test_stack_dummies = pd.get_dummies(
    train_test_stack,
    columns=['supplier', 'bracket_pricing'],
)
In [59]:
# Pull the test rows back out of the stacked, one-hot-encoded frame.
# The stack is [train rows, test rows] and get_dummies preserves row order,
# so the test rows are exactly the trailing len(test_set) rows. Slicing by
# position is exact, whereas filtering on the placeholder `cost == 0` would
# also pick up any training row whose real cost happens to be 0.
n_test_rows = len(test_set)
test_rows = train_test_stack_dummies.iloc[len(train_test_stack_dummies) - n_test_rows:]

# Drop the dummy columns for suppliers that only occur in the test data, and
# the redundant 'bracket_pricing_No' indicator. The placeholder `cost` column
# is kept, as before.
test_dummies = test_rows.drop(test_set_only_suppliers_columns + ['bracket_pricing_No'], axis=1)

# Re-attach the ids positionally: the rows are in the same order as test_set,
# so this does not depend on the index labels lining up.
test_dummies['id'] = test_set['id'].values
In [60]:
# Preview the first rows of the encoded test frame before it is written out.
test_dummies.head()
Out[60]:
In [61]:
# Persist the encoded test frame; the row index is not written.
test_dummies.to_csv(
    'test_dummies_adjusted.csv',
    index=False,
)