notebook.community

Edit and run



In [2]:

    
%pylab inline
import pandas as pd
from sklearn.cross_validation import LeaveOneOut
from ggplot import *









    



Populating the interactive namespace from numpy and matplotlib



In [3]:

    
train_df = pd.read_csv('train.csv', parse_dates=[1])



In [4]:

    
print train_df.head()









    



   Id  Open Date        City  City Group Type  P1   P2  P3  P4  P5   ...     \
0   0 1999-07-17    İstanbul  Big Cities   IL   4  5.0   4   4   2   ...      
1   1 2008-02-14      Ankara  Big Cities   FC   4  5.0   4   4   1   ...      
2   2 2013-03-09  Diyarbakır       Other   IL   2  4.0   2   5   2   ...      
3   3 2012-02-02       Tokat       Other   IL   6  4.5   6   6   4   ...      
4   4 2009-05-09   Gaziantep       Other   IL   3  4.0   3   4   2   ...      

   P29  P30  P31  P32  P33  P34  P35  P36  P37  revenue  
0  3.0    5    3    4    5    5    4    3    4  5653753  
1  3.0    0    0    0    0    0    0    0    0  6923131  
2  3.0    0    0    0    0    0    0    0    0  2055379  
3  7.5   25   12   10    6   18   12   12    6  2675511  
4  3.0    5    1    3    2    3    4    3    3  4316715  

[5 rows x 43 columns]



In [5]:

    
ggplot(aes('Open Date', 'revenue', color='City Group'), data=train_df) + geom_point(size = 24) +\
    ggtitle('Revenue vs Open Date') +\
    geom_smooth(size=2, se=False)









    












    Out[5]:





<ggplot: (8741912887077)>



In [6]:

    
train_df_wo_outliers = train_df[ train_df.revenue < 10000000 ]
train_df_wo_outliers.shape









    Out[6]:





(134, 43)



In [7]:

    
train_df_wo_outliers.revenue.mean()









    Out[7]:





4181438.3656716417



In [8]:

    
ggplot(aes('Open Date', 'revenue', color='City Group'), data=train_df_wo_outliers) + geom_point(size = 24) +\
    ggtitle('Revenue vs Open Date') +\
    geom_smooth(size=2, se=False)









    












    Out[8]:





<ggplot: (8741916107341)>



In [26]:

    
p = ggplot(aes('Open Date', 'revenue', color='Type', shape='Type'), data=train_df_wo_outliers) +\
    geom_point() + geom_smooth(size=2, se=False)
p









    












    Out[26]:





<ggplot: (8741911747925)>



In [13]:

    
import plotly



In [15]:

    
plotly.plotly.plot(p)









    



---------------------------------------------------------------------------
PlotlyError                               Traceback (most recent call last)
<ipython-input-15-cf4cffa8c1d3> in <module>()
----> 1 plotly.plotly.plot(p)

/home/benqing/anaconda/lib/python2.7/site-packages/plotly/plotly/plotly.pyc in plot(figure_or_data, validate, **plot_options)
    166 
    167     """
--> 168     figure = tools.return_figure_from_figure_or_data(figure_or_data, validate)
    169 
    170     for entry in figure['data']:

/home/benqing/anaconda/lib/python2.7/site-packages/plotly/tools.pyc in return_figure_from_figure_or_data(figure_or_data, validate_figure)
   1246         figure = {'data': figure_or_data}
   1247     else:
-> 1248         raise exceptions.PlotlyError("The `figure_or_data` positional "
   1249                                      "argument must be either "
   1250                                      "`dict`-like or `list`-like.")

PlotlyError: The `figure_or_data` positional argument must be either `dict`-like or `list`-like.



In [7]:

    
loo = LeaveOneOut(134)



In [8]:

    
test_1_predicts = []
errs = []
for train, test in loo:
    train_1 = train_df_wo_outliers.iloc[ train ]
    test_1_pred = train_1.revenue.mean()
    test_1_predicts.append(test_1_pred)
    test_1_label = train_df_wo_outliers.iloc[ test ].revenue.iloc[0]
    errs.append(test_1_pred - test_1_label)



In [9]:

    
pd.Series(data=test_1_predicts).hist()









    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x1095fcd90>



In [79]:

    
train_df_wo_outliers.revenue.hist()









    Out[79]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f771ea6a950>



In [66]:

    
err_series = pd.Series(data=errs)



In [67]:

    
err_series.plot(kind='box')









    Out[67]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f771dfa8690>



In [70]:

    
err_series.hist(bins=20)









    Out[70]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f771e04f250>



In [77]:

    
test_rmse = sqrt( np.mean(err_series ** 2) )
print 'Cross validation estimated test error (RMSE) : \n'
print "{:,}".format(test_rmse)









    



Cross validation estimated test error (RMSE) : 

1,806,348.34393

How many data points do we have for pre-2007, 2007-2011 and post-2011



In [22]:

    
pre2007_df = train_df_wo_outliers[train_df_wo_outliers['Open Date'] < datetime.datetime(2006,12,31)]
print 'We have {} data points open before 2007'.format(pre2007_df.shape[0])
post2011_df = train_df_wo_outliers[train_df_wo_outliers['Open Date'] > datetime.datetime(2011,12,31)]
print 'We have {} data points open after 2011'.format(post2011_df.shape[0])
print 'We have {} data points open between 2007 and 2011'.format(train_df_wo_outliers.shape[0] - pre2007_df.shape[0] - post2011_df.shape[0])









    



We have 25 data points open before 2007
We have 32 data points open after 2011
We have 77 data points open between 2007 and 2011



In [ ]: