notebook.community

Edit and run



In [39]:

    
%pylab inline
import pandas as pd
from sklearn.cross_validation import LeaveOneOut
from ggplot import *









    



Populating the interactive namespace from numpy and matplotlib






    



WARNING: pylab import has clobbered these variables: ['xlim', 'ylim']
`%matplotlib` prevents importing * from pylab and numpy



In [40]:

    
# Load the training set. Column index 1 is 'Open Date', so it is parsed as a
# datetime rather than left as a string.
train_df = pd.read_csv(
    'train.csv',
    parse_dates=[1],
)



In [41]:

    
# Peek at the first rows of the raw training data.
# Written as print(...) with a single argument so the cell prints the same
# text under Python 2 and is also valid syntax under Python 3.
print(train_df.head())









    



   Id  Open Date        City  City Group Type  P1   P2  P3  P4  P5   ...     \
0   0 1999-07-17    İstanbul  Big Cities   IL   4  5.0   4   4   2   ...      
1   1 2008-02-14      Ankara  Big Cities   FC   4  5.0   4   4   1   ...      
2   2 2013-03-09  Diyarbakır       Other   IL   2  4.0   2   5   2   ...      
3   3 2012-02-02       Tokat       Other   IL   6  4.5   6   6   4   ...      
4   4 2009-05-09   Gaziantep       Other   IL   3  4.0   3   4   2   ...      

   P29  P30  P31  P32  P33  P34  P35  P36  P37  revenue  
0  3.0    5    3    4    5    5    4    3    4  5653753  
1  3.0    0    0    0    0    0    0    0    0  6923131  
2  3.0    0    0    0    0    0    0    0    0  2055379  
3  7.5   25   12   10    6   18   12   12    6  2675511  
4  3.0    5    1    3    2    3    4    3    3  4316715  

[5 rows x 43 columns]



In [42]:

    
# Revenue against opening date for the full training set, coloured by
# 'City Group', with a smoothing layer drawn on top (se=False).
(
    ggplot(aes('Open Date', 'revenue', color='City Group'), data=train_df)
    + geom_point(size=24)
    + ggtitle('Revenue vs Open Date')
    + geom_smooth(size=2, se=False)
)









    












    Out[42]:





<ggplot: (8759349054837)>



In [46]:

    
# Keep only the rows whose revenue is strictly below 10,000,000; anything at
# or above that is treated as an outlier for the rest of the notebook.
train_df_wo_outliers = train_df.loc[train_df['revenue'] < 10000000]

# Show (rows, columns) of the filtered frame.
train_df_wo_outliers.shape









    Out[46]:





(134, 43)



In [44]:

    
# Mean revenue over the outlier-filtered training rows.
train_df_wo_outliers['revenue'].mean()









    Out[44]:





4181438.3656716417



In [45]:

    
# Same revenue-vs-opening-date plot as for the full data, but drawn from the
# outlier-filtered frame.
(
    ggplot(aes('Open Date', 'revenue', color='City Group'), data=train_df_wo_outliers)
    + geom_point(size=24)
    + ggtitle('Revenue vs Open Date')
    + geom_smooth(size=2, se=False)
)









    












    Out[45]:





<ggplot: (8759351985613)>



In [49]:

    
# Leave-one-out splitter over the outlier-filtered rows.
# The fold count is taken from the frame itself rather than hard-coded, so the
# positional indices it yields always match train_df_wo_outliers even if the
# outlier threshold or the input data changes.
loo = LeaveOneOut(len(train_df_wo_outliers))



In [65]:

    
# Leave-one-out evaluation of a constant baseline: for every fold, predict the
# held-out row's revenue as the mean revenue of all the other rows, and record
# the signed error (prediction minus actual).
test_1_predicts, errs = [], []
for train_idx, test_idx in loo:
    # Mean revenue over the rows that stay in the training fold.
    fold_mean = train_df_wo_outliers.revenue.iloc[train_idx].mean()
    # The test fold holds exactly one positional index; fetch its revenue.
    held_out_revenue = train_df_wo_outliers.revenue.iloc[test_idx[0]]
    test_1_predicts.append(fold_mean)
    errs.append(fold_mean - held_out_revenue)



In [78]:

    
# Distribution of the per-fold baseline predictions.
pd.Series(test_1_predicts).hist()









    Out[78]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f771e522810>



In [79]:

    
# Distribution of the actual revenue values in the filtered training set.
train_df_wo_outliers['revenue'].hist()









    Out[79]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f771ea6a950>



In [66]:

    
# Wrap the list of signed errs in a Series so it can be plotted and
# aggregated with pandas.
err_series = pd.Series(errs)



In [67]:

    
# Box plot of the values in err_series (prediction minus actual revenue).
err_series.plot(kind='box')









    Out[67]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f771dfa8690>



In [70]:

    
# Histogram of err_series using 20 bins.
err_series.hist(bins=20)









    Out[70]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f771e04f250>



In [77]:

    
# Root-mean-squared error of the leave-one-out errors in err_series.
# np.sqrt is referenced explicitly instead of the bare `sqrt` that only exists
# because of the %pylab star import.
test_rmse = np.sqrt(np.mean(err_series ** 2))

# Single-argument print(...) calls produce the same text under Python 2 and
# are valid syntax under Python 3. The trailing '\n' in the first string
# leaves a blank line before the number.
print('Cross validation estimated test error (RMSE) : \n')
print("{:,}".format(test_rmse))









    



Cross validation estimated test error (RMSE) : 

1,806,348.34393