In [1]:
%pylab inline
import pandas as pd
In [26]:
train_df = pd.read_csv('./competition_data/train_set.csv', parse_dates=['quote_date'])
Add two columns, year and month, to the dataframe.
In [31]:
train_df['year'] = train_df.quote_date.apply(lambda d : d.year)
train_df['month'] = train_df.quote_date.apply(lambda d : d.month)
How many cost(i.e. records) are there in each (year, month).
In [73]:
cost_count_by_year_month = train_df.groupby(['year', 'month']).cost.count()
plt.subplots(figsize=(8,6))
cost_count_by_year_month.plot()
plt.title('The number of cost records in each year and month', {'fontsize' : 'large'})
plt.ylabel('count', {'fontsize' : 'large'})
plt.grid()
plt.show()
We see that from 2011, the number of records increases significantly.
In [51]:
cost_count_by_year_month.ix[2011:]
Out[51]:
Let's plot the median cost in each (year, month) group.
In [74]:
median_cost_by_year_month = train_df.groupby(by=['year', 'month']).cost.median()
median_cost_by_year_month.plot()
plt.title('Median cost in each (year, month)')
plt.ylabel('cost')
plt.grid()
plt.show()
But since only after 2011 there exists sufficient amount of quotes, let's zoom in to the post 2011 period.
In [75]:
median_cost_by_year_month.ix[2011:].plot()
plt.title('Median cost in each (year, month)')
plt.ylabel('cost')
plt.grid()
plt.show()
The 'outliers' in the median cost space are shown below.
In [77]:
print median_cost_by_year_month[median_cost_by_year_month>50]
We might be more interested in the distribution of the cost within each (year, month).
In [91]:
cost_by_year_month = train_df.groupby(by=['year', 'month']).cost
fig, ax = plt.subplots(figsize=(8, 6))
train_df.boxplot(column='cost', by='year', ax=ax)
# cost_by_year_month.plot(kind='box')
plt.xticks(rotation=90)
plt.show()
In [27]:
train_df_no_bracket = train_df[train_df.bracket_pricing == 'No']
In [28]:
train_df_no_bracket
Out[28]:
In [ ]: