In [2]:
%pylab inline
import pandas as pd
from sklearn.cross_validation import LeaveOneOut
from ggplot import *
In [3]:
# Read the training set; the column at position 1 is parsed as dates
# (presumably 'Open Date', which later cells compare against datetimes -- TODO confirm).
train_df = pd.read_csv(
    'train.csv',
    parse_dates=[1],
)
In [4]:
# Preview the first rows. A bare last expression uses the notebook's rich
# DataFrame display and works on both Python 2 and Python 3, unlike the
# Python-2-only `print` statement.
train_df.head()
In [5]:
# Scatter of revenue against opening date, coloured by 'City Group', with a
# smoothed trend line (no confidence band) drawn on top of the points.
(
    ggplot(aes('Open Date', 'revenue', color='City Group'), data=train_df)
    + geom_point(size=24)
    + ggtitle('Revenue vs Open Date')
    + geom_smooth(size=2, se=False)
)
Out[5]:
In [6]:
# Keep only the rows whose revenue is strictly below 10,000,000; rows at or
# above that threshold are treated as outliers and dropped from the analysis.
train_df_wo_outliers = train_df.loc[train_df['revenue'] < 10000000]
# Show (rows, columns) of the filtered frame.
train_df_wo_outliers.shape
Out[6]:
In [7]:
# Average revenue over the outlier-free rows.
train_df_wo_outliers['revenue'].mean()
Out[7]:
In [8]:
# Same revenue-vs-opening-date scatter as for the full data, but drawn from the
# outlier-free frame so the trend line is not dominated by extreme revenues.
(
    ggplot(aes('Open Date', 'revenue', color='City Group'), data=train_df_wo_outliers)
    + geom_point(size=24)
    + ggtitle('Revenue vs Open Date')
    + geom_smooth(size=2, se=False)
)
Out[8]:
In [26]:
# Revenue against opening date on the outlier-free data, with both colour and
# marker shape keyed on 'Type'. The plot object is kept in `p` so that later
# cells can reuse it.
p = (
    ggplot(aes('Open Date', 'revenue', color='Type', shape='Type'), data=train_df_wo_outliers)
    + geom_point()
    + geom_smooth(size=2, se=False)
)
# Display the plot as the cell output.
p
Out[26]:
In [13]:
import plotly
In [15]:
# Pass the ggplot object `p` to plotly's online `plot` call.
# NOTE(review): plotly.plotly.plot is documented as taking a plotly figure or
# data list -- confirm that a ggplot object is accepted here; if not, the
# underlying matplotlib figure probably needs converting first (e.g. plot_mpl).
plotly.plotly.plot(p)
In [7]:
# Leave-one-out splitter over the outlier-free rows. The number of samples is
# taken from the frame itself rather than hard-coded, so the splitter cannot
# fall out of sync with the data if the outlier threshold or input file changes.
loo = LeaveOneOut(len(train_df_wo_outliers))
In [8]:
# Leave-one-out evaluation of a constant baseline: for each held-out row,
# predict the mean revenue of all remaining rows and record the signed error.
test_1_predicts = []  # one baseline prediction per held-out row
errs = []             # prediction minus actual revenue, in the same order
for train, test in loo:
    # "Fit": the baseline model is just the mean revenue of the training fold.
    train_1 = train_df_wo_outliers.iloc[train]
    test_1_pred = train_1.revenue.mean()
    test_1_predicts.append(test_1_pred)
    # The leave-one-out test fold holds a single row; take its revenue as the label.
    test_1_label = train_df_wo_outliers.iloc[test].revenue.iloc[0]
    errs.append(test_1_pred - test_1_label)
In [9]:
# Histogram of the leave-one-out baseline predictions.
pd.Series(test_1_predicts).hist()
Out[9]:
In [79]:
# Histogram of the actual revenues in the outlier-free data.
train_df_wo_outliers['revenue'].hist()
Out[79]:
In [66]:
# Wrap the signed leave-one-out errors in a Series for plotting and statistics.
err_series = pd.Series(errs)
In [67]:
# Box plot of the signed leave-one-out errors (prediction minus actual revenue).
err_series.plot(kind='box')
Out[67]:
In [70]:
# Histogram of the signed leave-one-out errors, using 20 bins.
err_series.hist(bins=20)
Out[70]:
In [77]:
# Root-mean-squared error of the leave-one-out baseline predictions.
# `np.sqrt` is used explicitly rather than the bare `sqrt` injected by %pylab,
# and the single-argument print(...) form runs on both Python 2 and Python 3.
test_rmse = np.sqrt(np.mean(err_series ** 2))
print('Cross validation estimated test error (RMSE) : \n')
# "{:,}" adds thousands separators to the printed value.
print("{:,}".format(test_rmse))
In [22]:
# Count how many outlier-free rows fall into each opening-date bucket.
# Boundaries are the first instant of 2007 and of 2012, so a restaurant opened
# on 2006-12-31 is counted as "before 2007" (a strict `< 2006-12-31` comparison
# would push it into the middle bucket) and any time on or after 2012-01-01
# counts as "after 2011".
start_of_2007 = pd.Timestamp('2007-01-01')
start_of_2012 = pd.Timestamp('2012-01-01')
open_dates = train_df_wo_outliers['Open Date']

pre2007_df = train_df_wo_outliers[open_dates < start_of_2007]
print('We have {} data points open before 2007'.format(pre2007_df.shape[0]))
post2011_df = train_df_wo_outliers[open_dates >= start_of_2012]
print('We have {} data points open after 2011'.format(post2011_df.shape[0]))
# Whatever is in neither bucket is attributed to 2007-2011.
print('We have {} data points open between 2007 and 2011'.format(train_df_wo_outliers.shape[0] - pre2007_df.shape[0] - post2011_df.shape[0]))
In [ ]: