notebook.community

Edit and run



In [28]:

    
%pylab inline
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn import linear_model









    



Populating the interactive namespace from numpy and matplotlib



In [29]:

    
# Load the training data and rename its last column to 'total_rentals'.
train = pd.read_csv('train.csv')
# rename() builds a new column index rather than writing into the array behind
# train.columns (pandas documents Index objects as immutable).
train = train.rename(columns={train.columns[-1]: 'total_rentals'})

# Columns 9-11 by position; the same slice is used as the regression target
# further down. .iloc is the positional indexer (.ix was removed in pandas 1.0).
targets = train.iloc[:, 9:12]
print(targets.shape)

# One panel per selected column, each plotted against the row index.
fig, ax = plt.subplots(1, 3)
for i, panel in enumerate(ax):
    panel.plot(targets.iloc[:, i])
    panel.set_title(targets.columns[i])
plt.show()



In [30]:

    
# Feature matrix: columns 1-8 of `train` by position, with the hour of day
# prepended as a `time` column. copy() gives X its own data so that insert()
# below does not operate on a slice of `train`.
X = train.iloc[:, 1:9].copy()
# Parse the timestamp column with pandas instead of relying on a `datetime`
# name that is only in scope as a side effect of %pylab.
hours = pd.to_datetime(train['datetime'], format='%Y-%m-%d %H:%M:%S').dt.hour.tolist()
X.insert(0, 'time', hours)
# Targets: columns 9-11 by position.
y = train.iloc[:, 9:12]
# Single formatted string so the output is the same on Python 2 and 3.
print('{} {}'.format(X.shape, y.shape))

# Project the features onto two principal components.
pca2 = PCA(n_components=2)
X_2 = pca2.fit_transform(X)
print(X_2.shape)









    



(10886, 9) (10886, 3)
(10886, 2)



In [31]:

    
# Load the test data and add the hour of day as a `time` column right after
# `datetime` (position 1).
test_set = pd.read_csv('test.csv')
# Parse the timestamp column with pandas instead of relying on a `datetime`
# name that is only in scope as a side effect of %pylab.
hours = pd.to_datetime(test_set['datetime'], format='%Y-%m-%d %H:%M:%S').dt.hour.tolist()
test_set.insert(1, 'time', hours)
print(test_set.shape)
test_set.head()









    



(6493, 10)






    Out[31]:






  
    
      
      datetime
      time
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
    
  
  
    
      0
       2011-01-20 00:00:00
       0
       1
       0
       1
       1
       10.66
       11.365
       56
       26.0027
    
    
      1
       2011-01-20 01:00:00
       1
       1
       0
       1
       1
       10.66
       13.635
       56
        0.0000
    
    
      2
       2011-01-20 02:00:00
       2
       1
       0
       1
       1
       10.66
       13.635
       56
        0.0000
    
    
      3
       2011-01-20 03:00:00
       3
       1
       0
       1
       1
       10.66
       12.880
       56
       11.0014
    
    
      4
       2011-01-20 04:00:00
       4
       1
       0
       1
       1
       10.66
       12.880
       56
       11.0014
    
  

5 rows × 10 columns



In [32]:

    
# Fit one linear regression to all columns of y at once.
lm = linear_model.LinearRegression()
lm.fit(X, y)
# Score is computed on the same data the model was fitted to, so it is a
# training score, not a held-out estimate.
print(lm.score(X, y))
# Drop column 0 of test_set positionally; the remaining columns are passed to
# the model as features. .iloc replaces .ix, which was removed in pandas 1.0.
y_ = lm.predict(test_set.iloc[:, 1:])

# Panels 0-2: one predicted series per column of y_.
# Panel 3: sum of the first two training target columns, for visual comparison.
fig, ax = plt.subplots(1, 4)
for i, d in enumerate(y_.T):
    ax[i].plot(d)
    ax[i].set_title('predicted {}'.format(y.columns[i]))
ax[3].plot(y.iloc[:, 0] + y.iloc[:, 1])
ax[3].set_title('train {} + {}'.format(y.columns[0], y.columns[1]))
# show() renders the figure and keeps the Line2D repr out of the cell output.
plt.show()









    



0.316606687211






    Out[32]:





[<matplotlib.lines.Line2D at 0x116ac6fd0>]

Write out the submission!



In [26]:

    
# Load the sample submission and rename its last column to 'total_rentals'.
sample_submission = pd.read_csv('sampleSubmission.csv')
# rename() builds a new column index rather than writing into the array behind
# sample_submission.columns (pandas documents Index objects as immutable).
sample_submission = sample_submission.rename(
    columns={sample_submission.columns[-1]: 'total_rentals'})

# Fill a copy of the template with column 2 of the predictions.
my_submission = sample_submission.copy()
# Clamp negatives to zero BEFORE rounding: rounding first turns values in
# (-0.5, 0) into -0.0, which a `< 0` test does not catch. The int cast stores
# whole-number counts.
my_submission['total_rentals'] = np.round(np.maximum(y_[:, 2], 0)).astype(int)
plt.plot(my_submission['total_rentals'])
plt.show()
print(my_submission.shape)
my_submission.head()









    












    



(6493, 2)






    Out[26]:






  
    
      
      datetime
      total_rentals
    
  
  
    
      0
       2011-01-20 00:00:00
       15
    
    
      1
       2011-01-20 01:00:00
       23
    
    
      2
       2011-01-20 02:00:00
       30
    
    
      3
       2011-01-20 03:00:00
       39
    
    
      4
       2011-01-20 04:00:00
       46
    
  

5 rows × 2 columns



In [27]:

    
# Rename the last column to 'count' and write the frame to disk without the
# row index. rename() avoids writing into the array behind
# my_submission.columns; the name is rebound so later cells still see the
# renamed frame.
my_submission = my_submission.rename(columns={my_submission.columns[-1]: 'count'})
my_submission.to_csv('my_submission.csv', index=False)



In [ ]:

	datetime	time	season	workingday	weather	temp	atemp	humidity	windspeed
0	2011-01-20 00:00:00	0	1	1	1	10.66	11.365	56	26.0027
1	2011-01-20 01:00:00	1	1	1	1	10.66	13.635	56	0.0000
2	2011-01-20 02:00:00	2	1	1	1	10.66	13.635	56	0.0000
3	2011-01-20 03:00:00	3	1	1	1	10.66	12.880	56	11.0014
4	2011-01-20 04:00:00	4	1	1	1	10.66	12.880	56	11.0014