In [39]:
%pylab inline
import pandas as pd
from sklearn.cross_validation import LeaveOneOut
from ggplot import *
In [40]:
# Load the training set. parse_dates=[1] asks pandas to parse the column at
# position 1 as dates -- presumably 'Open Date', which the plots below use as
# the x axis; confirm against the CSV header.
train_df = pd.read_csv(
    'train.csv',
    parse_dates=[1],
)
In [41]:
# Preview the first rows. The parenthesised single-argument form prints the
# same text under Python 2 and is also valid Python 3.
print(train_df.head())
In [42]:
# Scatter of revenue against opening date for the full training set,
# coloured by city group, with a smoothed trend line overlaid (se=False).
(
    ggplot(aes('Open Date', 'revenue', color='City Group'), data=train_df)
    + geom_point(size=24)
    + ggtitle('Revenue vs Open Date')
    + geom_smooth(size=2, se=False)
)
Out[42]:
In [46]:
# Keep only rows whose revenue is strictly below 10,000,000; everything at or
# above that threshold is treated as an outlier and dropped.
is_below_threshold = train_df['revenue'] < 10000000
train_df_wo_outliers = train_df[is_below_threshold]
# Show (rows, columns) of the filtered frame.
train_df_wo_outliers.shape
Out[46]:
In [44]:
# Mean revenue over the outlier-filtered rows.
train_df_wo_outliers['revenue'].mean()
Out[44]:
In [45]:
# Same revenue-vs-opening-date scatter as before, but drawn from the
# outlier-filtered frame so the trend line is not dominated by extreme rows.
(
    ggplot(aes('Open Date', 'revenue', color='City Group'), data=train_df_wo_outliers)
    + geom_point(size=24)
    + ggtitle('Revenue vs Open Date')
    + geom_smooth(size=2, se=False)
)
Out[45]:
In [49]:
# Leave-one-out splitter with one fold per row of the outlier-filtered frame.
# The row count is taken from the frame itself (it was hard-coded as 134) so
# the folds stay in step with the data if the file or the outlier threshold
# changes.
# NOTE(review): this is the legacy sklearn.cross_validation API, where the
# constructor takes the number of samples; newer scikit-learn releases expose
# LeaveOneOut via sklearn.model_selection with a .split(X) method instead.
loo = LeaveOneOut(len(train_df_wo_outliers))
In [65]:
# Leave-one-out evaluation of a constant baseline: for every fold, predict the
# held-out row's revenue as the mean revenue of the remaining rows.
#   test_1_predicts -- one prediction per fold
#   errs            -- signed error per fold (prediction minus actual)
test_1_predicts = []
errs = []
for train, test in loo:
    # Positional indices of this fold's training rows.
    train_1 = train_df_wo_outliers.iloc[ train ]
    # Baseline "model": the training-split mean.
    test_1_pred = train_1.revenue.mean()
    test_1_predicts.append(test_1_pred)
    # Only the first held-out row's revenue is read.
    test_1_label = train_df_wo_outliers.iloc[ test ].revenue.iloc[0]
    errs.append(test_1_pred - test_1_label)
In [78]:
# Distribution of the per-fold baseline predictions.
pd.Series(test_1_predicts).hist()
Out[78]:
In [79]:
# Distribution of the actual revenue values in the outlier-filtered frame.
train_df_wo_outliers['revenue'].hist()
Out[79]:
In [66]:
# Wrap the per-fold signed errors in a Series for plotting and aggregation.
err_series = pd.Series(errs)
In [67]:
# Box plot of the per-fold signed errors held in err_series.
err_series.plot(kind='box')
Out[67]:
In [70]:
# Histogram of the per-fold signed errors, using 20 bins.
err_series.hist(bins=20)
Out[70]:
In [77]:
# Root-mean-square of the per-fold signed errors. np.sqrt is called explicitly
# rather than relying on the bare `sqrt` that %pylab injects into the namespace.
test_rmse = np.sqrt(np.mean(err_series ** 2))
# Single-argument parenthesised prints behave the same on Python 2 and 3.
print('Cross validation estimated test error (RMSE) : \n')
print("{:,}".format(test_rmse))