In [1]:
%pylab inline
import pandas as pd
In [2]:
# Load the competition test set from the local competition_data directory.
test_set = pd.read_csv(filepath_or_buffer='competition_data/test_set.csv')
In [3]:
# Quick overview of the test set: dimensions, per-column dtypes and a small sample.
# print(...) with a single argument produces the same output on Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
print(test_set.shape)
print(test_set.dtypes)
# A fixed random_state keeps the displayed 5-row sample identical across re-runs.
test_set.sample(n=5, random_state=0)
Out[3]:
In [4]:
# Group the test-set rows by the value of their `bracket_pricing` column.
test_set_by_bracket = test_set.groupby('bracket_pricing')
In [5]:
# Display, for each `bracket_pricing` group, the number of non-null values in every other column.
test_set_by_bracket.count()
Out[5]:
In [7]:
# Copy of the test set without the `quote_date` column; `test_set` itself is left untouched.
test_set_no_date = test_set.drop('quote_date', axis=1)
In [11]:
# Replace the `supplier` and `bracket_pricing` columns with indicator (dummy)
# columns; get_dummies names each new column '<column>_<value>'.
test_set_no_date_dummies = pd.get_dummies(
    test_set_no_date,
    columns=['supplier', 'bracket_pricing'],
)
# To inspect the resulting column dtypes: test_set_no_date_dummies.dtypes
In [10]:
# Drop the 'bracket_pricing_No' indicator column.
# NOTE(review): presumably it is redundant with the remaining bracket_pricing
# indicator column -- confirm bracket_pricing only takes two values.
test_set_no_date_dummies_drop_bracket_no = test_set_no_date_dummies.drop(
    'bracket_pricing_No', axis=1
)
In [ ]:
# test_set_no_date_dummies_drop_bracket_no.to_csv('')
In [12]:
# Load the competition training set so it can be compared against the test set.
train_set = pd.read_csv(filepath_or_buffer='./competition_data/train_set.csv')
In [14]:
# Raw arrays of the `tube_assembly_id` column from each data set, for set comparisons.
train_ta_ids = train_set['tube_assembly_id'].values
test_ta_ids = test_set['tube_assembly_id'].values
In [15]:
# Display the sorted, unique tube_assembly_id values that occur in both the training and test sets.
np.intersect1d(train_ta_ids, test_ta_ids)
Out[15]:
In [16]:
# Raw arrays of the `supplier` column from each data set, for set comparisons.
train_suppliers = train_set['supplier'].values
test_suppliers = test_set['supplier'].values
In [18]:
# Display the sorted, unique supplier values that occur in both the training and test sets.
np.intersect1d(train_suppliers, test_suppliers)
Out[18]:
In [21]:
# Display the sorted, unique supplier values present in the test set.
np.unique(test_suppliers)
Out[21]:
In [26]:
# Suppliers that appear in the test set but never in the training set.
test_set_only_suppliers = np.setdiff1d(test_suppliers, train_suppliers)
# Matching indicator-column names: get_dummies on `supplier` produced
# columns named 'supplier_<value>'.
test_set_only_suppliers_columns = [
    'supplier_' + supplier_name
    for supplier_name in test_set_only_suppliers
]
In [25]:
# Display the column labels of the encoded frame (useful for checking the indicator-column names).
test_set_no_date_dummies_drop_bracket_no.columns
Out[25]:
In [27]:
# Remove the indicator columns of suppliers that occur only in the test set,
# producing a new frame and leaving the input frame unchanged.
test_set_no_date_dummies_drop_bracket_no_drop_suppliers = (
    test_set_no_date_dummies_drop_bracket_no
    .drop(test_set_only_suppliers_columns, axis=1)
)
In [28]:
# Display the first rows of the final frame as a sanity check before exporting it.
test_set_no_date_dummies_drop_bracket_no_drop_suppliers.head()
Out[28]:
In [29]:
# Write the processed test set to CSV in the working directory;
# index=False keeps the row index out of the file.
test_set_no_date_dummies_drop_bracket_no_drop_suppliers.to_csv(
    path_or_buf='test_set_no_date_dummies_drop_bracket_no_drop_suppliers.csv',
    index=False,
)