In [39]:
%pylab inline
import pandas as pd
from sklearn.cross_validation import LeaveOneOut
from ggplot import *


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['xlim', 'ylim']
`%matplotlib` prevents importing * from pylab and numpy

In [40]:
# Load the training set; the column at position 1 ('Open Date' in this file)
# is parsed into datetimes so it can be used as a time axis later.
date_column_positions = [1]
train_df = pd.read_csv('train.csv', parse_dates=date_column_positions)

In [41]:
# Quick look at the first rows of the training frame (plain-text rendering).
print(train_df.head())


   Id  Open Date        City  City Group Type  P1   P2  P3  P4  P5   ...     \
0   0 1999-07-17    İstanbul  Big Cities   IL   4  5.0   4   4   2   ...      
1   1 2008-02-14      Ankara  Big Cities   FC   4  5.0   4   4   1   ...      
2   2 2013-03-09  Diyarbakır       Other   IL   2  4.0   2   5   2   ...      
3   3 2012-02-02       Tokat       Other   IL   6  4.5   6   6   4   ...      
4   4 2009-05-09   Gaziantep       Other   IL   3  4.0   3   4   2   ...      

   P29  P30  P31  P32  P33  P34  P35  P36  P37  revenue  
0  3.0    5    3    4    5    5    4    3    4  5653753  
1  3.0    0    0    0    0    0    0    0    0  6923131  
2  3.0    0    0    0    0    0    0    0    0  2055379  
3  7.5   25   12   10    6   18   12   12    6  2675511  
4  3.0    5    1    3    2    3    4    3    3  4316715  

[5 rows x 43 columns]

In [42]:
# Revenue against opening date for every training row, coloured by city group,
# with a smoothed trend line (no confidence band) drawn on top of the points.
(
    ggplot(aes('Open Date', 'revenue', color='City Group'), data=train_df)
    + geom_point(size=24)
    + ggtitle('Revenue vs Open Date')
    + geom_smooth(size=2, se=False)
)


Out[42]:
<ggplot: (8759349054837)>

In [46]:
# Keep only the rows whose revenue is strictly below 10,000,000; anything at or
# above that value is treated as an outlier and excluded from the analysis.
is_below_cutoff = train_df['revenue'] < 10000000
train_df_wo_outliers = train_df[is_below_cutoff]
train_df_wo_outliers.shape


Out[46]:
(134, 43)

In [44]:
# Average revenue over the outlier-free rows.
train_df_wo_outliers['revenue'].mean()


Out[44]:
4181438.3656716417

In [45]:
# Same revenue-vs-opening-date plot as before, but drawn from the outlier-free
# frame so the trend line is not dominated by the extreme revenues.
(
    ggplot(aes('Open Date', 'revenue', color='City Group'), data=train_df_wo_outliers)
    + geom_point(size=24)
    + ggtitle('Revenue vs Open Date')
    + geom_smooth(size=2, se=False)
)


Out[45]:
<ggplot: (8759351985613)>

In [49]:
# Leave-one-out splitter over the outlier-free rows. The sample count is taken
# from the frame itself rather than hard-coded, so the splitter stays in sync
# if the outlier cutoff or the input data changes.
loo = LeaveOneOut(len(train_df_wo_outliers))

In [65]:
# Leave-one-out evaluation of a constant baseline: for every fold, predict the
# held-out row's revenue as the mean revenue of all the remaining rows.
#   test_1_predicts -- one baseline prediction per fold
#   errs            -- signed error per fold (prediction minus actual revenue)
test_1_predicts = []
errs = []
revenue_wo_outliers = train_df_wo_outliers['revenue']
for fit_positions, held_out_positions in loo:
    fold_prediction = revenue_wo_outliers.iloc[fit_positions].mean()
    # Each fold holds out a single row; take its revenue as a scalar.
    held_out_revenue = revenue_wo_outliers.iloc[held_out_positions].iloc[0]
    test_1_predicts.append(fold_prediction)
    errs.append(fold_prediction - held_out_revenue)

In [78]:
# Histogram of the per-fold baseline predictions.
fold_predictions = pd.Series(test_1_predicts)
fold_predictions.hist()


Out[78]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f771e522810>

In [79]:
# Histogram of the actual revenues in the outlier-free frame.
train_df_wo_outliers['revenue'].hist()


Out[79]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f771ea6a950>

In [66]:
# Wrap the collected per-fold errors in a Series for plotting and aggregation.
err_series = pd.Series(errs)

In [67]:
# Box plot of the cross-validation errors held in err_series.
err_series.plot(kind='box')


Out[67]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f771dfa8690>

In [70]:
# Histogram of the cross-validation errors held in err_series, using 20 bins.
err_series.hist(bins=20)


Out[70]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f771e04f250>

In [77]:
# Root-mean-square of the cross-validation errors: square each error, average
# the squares, then take the square root. Printed with thousands separators.
squared_errs = err_series ** 2
test_rmse = np.sqrt(squared_errs.mean())
print('Cross validation estimated test error (RMSE) : \n')
print("{:,}".format(test_rmse))


Cross validation estimated test error (RMSE) : 

1,806,348.34393