In [42]:
%pylab inline
import pandas as pd
In [43]:
# Load the training data; quote_date is parsed into datetimes at read time.
train_set = pd.read_csv(
    filepath_or_buffer='./competition_data/train_set.csv',
    parse_dates=['quote_date'],
)
In [44]:
# Quick overview of the loaded frame: dimensions, column dtypes, and a
# reproducible 10-row random sample (fixed random_state).
# Single-argument print(...) calls produce the same output on Python 2 and
# Python 3, whereas the bare `print x` statement form is a SyntaxError on 3.
print(train_set.shape)
print(train_set.dtypes)
print(train_set.sample(10, random_state=0))
In [45]:
# Non-null count of every other column, per bracket_pricing value.
train_set.groupby(by='bracket_pricing').count()
Out[45]:
In [65]:
# Split quote_date into integer `year` and `month` columns, then drop the raw
# timestamp column.
# The vectorized .dt accessor replaces a per-row Python lambda. It requires
# quote_date to be a datetime column, which the read_csv call in this notebook
# requests via parse_dates=['quote_date'].
train_set['year'] = train_set['quote_date'].dt.year
train_set['month'] = train_set['quote_date'].dt.month
# Rebind instead of inplace=True: same resulting frame, no hidden mutation.
train_set = train_set.drop(['quote_date'], axis=1)
# train_set['quantity_rep'] = train_set.quantity.apply(lambda x : 1.0 / x)
In [66]:
# Preview the first rows after the year/month columns were added.
train_set.head(n=5)
Out[66]:
In [68]:
# Persist the frame (with year/month columns) without the row index.
train_set.to_csv(
    path_or_buf='train_set_ym.csv',
    index=False,
)
In [7]:
# One-hot encode supplier and bracket_pricing, then remove the
# bracket_pricing_No indicator column.
train_set_dummies = (
    pd.get_dummies(train_set, columns=['supplier', 'bracket_pricing'])
    .drop(['bracket_pricing_No'], axis=1)
)
In [8]:
# Preview the one-hot encoded frame.
train_set_dummies.head(n=5)
Out[8]:
In [9]:
# train_set_dummies.to_csv('train_set_adjusted.csv', index=False)
In [10]:
# train_set_no_date = train_set.drop(['quote_date'], axis=1)
In [11]:
# train_set_no_date_dummies = pd.get_dummies(train_set_no_date, columns=['supplier', 'bracket_pricing'])
In [12]:
# train_set_no_date_dummies_drop_bracket_n = \
# train_set_no_date_dummies.drop(['bracket_pricing_No'], axis=1)
In [13]:
# train_set_no_date_dummies_drop_bracket_n.to_csv('train_set_no_date_dummies_drop_bracket_n.csv', index=False)
In [14]:
# Subset of rows whose bracket_pricing value is 'No'.
train_set_no_bracket = train_set.loc[train_set['bracket_pricing'] == 'No']
In [23]:
# Preview the bracket_pricing == 'No' subset.
train_set_no_bracket.head(n=5)
Out[23]:
In [22]:
# Histogram of the `year` column for rows where bracket_pricing == 'No'.
# The title and axis labels make the figure self-describing; without them it
# cannot be told apart from a histogram of any other subset.
ax = train_set_no_bracket['year'].hist(bins=20)
ax.set_title("Year distribution (bracket_pricing == 'No')")
ax.set_xlabel('year')
ax.set_ylabel('row count')
plt.show()
In [18]:
# Subset of rows whose bracket_pricing value is 'Yes'.
train_set_yes_bracket = train_set.loc[train_set['bracket_pricing'] == 'Yes']
In [21]:
# Histogram of the `year` column for rows where bracket_pricing == 'Yes'.
# The title and axis labels make the figure self-describing; without them it
# cannot be told apart from a histogram of any other subset.
ax = train_set_yes_bracket['year'].hist(bins=20)
ax.set_title("Year distribution (bracket_pricing == 'Yes')")
ax.set_xlabel('year')
ax.set_ylabel('row count')
plt.show()
In [41]:
# Per-column non-null counts, pivoted on bracket_pricing.
train_set.pivot_table(index='bracket_pricing', aggfunc='count')
Out[41]:
In [40]:
# Row count per supplier within the bracket_pricing == 'No' subset, most
# frequent first. Uses the Series method because the top-level
# pd.value_counts function is deprecated in recent pandas releases.
train_set_no_bracket['supplier'].value_counts()
Out[40]:
In [39]:
# Row count per supplier within the bracket_pricing == 'Yes' subset, most
# frequent first. Uses the Series method because the top-level
# pd.value_counts function is deprecated in recent pandas releases.
train_set_yes_bracket['supplier'].value_counts()
Out[39]:
In [ ]: