notebook.community

Edit and run



In [1]:

    
%pylab inline
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn import linear_model









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
# Load the training data and rename its last column to 'total_rentals' so the
# later cells can reach it with attribute access (train.total_rentals).
train = pd.read_csv('train.csv')

# Build a fresh label list instead of writing into `train.columns.values`:
# an Index is meant to be immutable, and mutating its backing array in place
# can leave cached lookups stale (or fail outright on read-only buffers).
new_labels = list(train.columns[:-1]) + ['total_rentals']
train.columns = new_labels

train.head()









    Out[2]:






  
    
      
      datetime
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      casual
      registered
      total_rentals
    
  
  
    
      0
       2011-01-01 00:00:00
       1
       0
       0
       1
       9.84
       14.395
       81
       0
       3
       13
       16
    
    
      1
       2011-01-01 01:00:00
       1
       0
       0
       1
       9.02
       13.635
       80
       0
       8
       32
       40
    
    
      2
       2011-01-01 02:00:00
       1
       0
       0
       1
       9.02
       13.635
       80
       0
       5
       27
       32
    
    
      3
       2011-01-01 03:00:00
       1
       0
       0
       1
       9.84
       14.395
       75
       0
       3
       10
       13
    
    
      4
       2011-01-01 04:00:00
       1
       0
       0
       1
       9.84
       14.395
       75
       0
       0
        1
        1
    
  

5 rows × 12 columns



In [3]:

    
# Draw each rental-count series in its own figure, in the order
# casual -> registered -> total, using the row index as the x axis.
for series_name in ('casual', 'registered', 'total_rentals'):
    plt.plot(train[series_name])
    plt.show()



In [4]:

    
# Build the training feature matrix.
#
# Positional columns 1..8 of `train` are the raw predictors (everything after
# the datetime column and before the rental counts). `.iloc` replaces the
# removed `.ix` indexer, and `.copy()` makes X an independent frame so the
# inserts below never touch (or warn about) `train`.
X = train.iloc[:, 1:9].copy()

# Append the square root of the last four predictors as extra features.
# NOTE: the '2' suffix (e.g. 'temp2') marks the sqrt-transformed column, not a
# square; the names are kept because the test-set cell builds the same ones.
for i, col in enumerate(X.columns[4:8]):
    X.insert(i + 8, col + str(2), X[col] ** .5)

# Parse the timestamps in one vectorized call (strict format, same as before)
# and expose month / hour-of-day as plain lists.
dts = pd.to_datetime(train.datetime, format='%Y-%m-%d %H:%M:%S')
months = dts.dt.month.tolist()
hours = dts.dt.hour.tolist()

# Inserted at position 0 in this order so the frame starts with month, hour.
X.insert(0, 'hour', hours)
X.insert(0, 'month', months)

X.head()









    Out[4]:






  
    
      
      month
      hour
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      temp2
      atemp2
      humidity2
      windspeed2
    
  
  
    
      0
       1
       0
       1
       0
       0
       1
       9.84
       14.395
       81
       0
       3.136877
       3.794074
       9.000000
       0
    
    
      1
       1
       1
       1
       0
       0
       1
       9.02
       13.635
       80
       0
       3.003331
       3.692560
       8.944272
       0
    
    
      2
       1
       2
       1
       0
       0
       1
       9.02
       13.635
       80
       0
       3.003331
       3.692560
       8.944272
       0
    
    
      3
       1
       3
       1
       0
       0
       1
       9.84
       14.395
       75
       0
       3.136877
       3.794074
       8.660254
       0
    
    
      4
       1
       4
       1
       0
       0
       1
       9.84
       14.395
       75
       0
       3.136877
       3.794074
       8.660254
       0
    
  

5 rows × 14 columns



In [5]:

    
# Regression targets, one column per quantity to predict.
# The order (total, registered, casual) matters: the modelling cell reads the
# prediction columns positionally as 0 = total, 1 = registered, 2 = casual.
# Selecting by name replaces the removed `.ix` indexer and the reversed
# negative slice, which yielded these same three columns in this order.
y = train[['total_rentals', 'registered', 'casual']]
y.head()









    Out[5]:






  
    
      
      total_rentals
      registered
      casual
    
  
  
    
      0
       16
       13
       3
    
    
      1
       40
       32
       8
    
    
      2
       32
       27
       5
    
    
      3
       13
       10
       3
    
    
      4
        1
        1
       0
    
  

5 rows × 3 columns



In [6]:

    
# Load the test data and give it the same engineered features as the
# training matrix, keeping the datetime column in position 0.
test_set = pd.read_csv('test.csv')

# Positional columns 5..8 are the four continuous predictors; append their
# square roots under '<name>2' (the suffix marks the sqrt feature).
for i, col in enumerate(test_set.columns[5:9]):
    test_set.insert(i + 9, col + str(2), test_set[col] ** .5)

# Vectorized, strict-format timestamp parsing; month and hour as plain lists.
dts = pd.to_datetime(test_set.datetime, format='%Y-%m-%d %H:%M:%S')
months = dts.dt.month.tolist()
hours = dts.dt.hour.tolist()

# Sanity check: one month and one hour per row. The formatted string prints
# the same text under Python 2 and Python 3.
print('%d %d' % (len(months), len(hours)))

# Inserted at position 1 in this order so the columns read
# datetime, month, hour, ...
test_set.insert(1, 'hour', hours)
test_set.insert(1, 'month', months)

print(test_set.shape)
test_set.head()









    



6493 6493
(6493, 15)






    Out[6]:






  
    
      
      datetime
      month
      hour
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      temp2
      atemp2
      humidity2
      windspeed2
    
  
  
    
      0
       2011-01-20 00:00:00
       1
       0
       1
       0
       1
       1
       10.66
       11.365
       56
       26.0027
       3.264966
       3.371202
       7.483315
       5.099284
    
    
      1
       2011-01-20 01:00:00
       1
       1
       1
       0
       1
       1
       10.66
       13.635
       56
        0.0000
       3.264966
       3.692560
       7.483315
       0.000000
    
    
      2
       2011-01-20 02:00:00
       1
       2
       1
       0
       1
       1
       10.66
       13.635
       56
        0.0000
       3.264966
       3.692560
       7.483315
       0.000000
    
    
      3
       2011-01-20 03:00:00
       1
       3
       1
       0
       1
       1
       10.66
       12.880
       56
       11.0014
       3.264966
       3.588872
       7.483315
       3.316836
    
    
      4
       2011-01-20 04:00:00
       1
       4
       1
       0
       1
       1
       10.66
       12.880
       56
       11.0014
       3.264966
       3.588872
       7.483315
       3.316836
    
  

5 rows × 15 columns



In [7]:

    
# Fit one linear model per calendar month on the training rows of that month
# and predict the test rows of the same month.
lm = linear_model.LinearRegression(fit_intercept=True)

# Predictions are stored at the position of the test row they belong to.
# Appending month by month would group rows by month number and so reorder
# them whenever the test file is not already sorted that way (for example
# when it covers more than one year); the submission pairs predictions with
# rows purely by position, so that ordering would misalign them.
n_test = len(test_set)
y_tot = np.zeros(n_test)
y_reg = np.zeros(n_test)
y_cas = np.zeros(n_test)

# Every test column except the leading datetime is a model feature.
feature_cols = test_set.columns[1:]

mean_score = 0
for m in range(1, 13):
    train_mask = X.month == m
    lm.fit(X[train_mask], y[train_mask])
    # Score on the rows the model was fitted on, averaged over the 12 months.
    mean_score += lm.score(X[train_mask], y[train_mask]) / 12

    test_mask = (test_set.month == m).values
    if not test_mask.any():
        # Nothing to predict for this month.
        continue
    pred = lm.predict(test_set.loc[test_mask, feature_cols])
    # Column order follows y: 0 = total, 1 = registered, 2 = casual.
    y_tot[test_mask] = pred[:, 0]
    y_reg[test_mask] = pred[:, 1]
    y_cas[test_mask] = pred[:, 2]

print(mean_score)

# A linear model can go negative; rental counts cannot.
y_cas[y_cas < 0] = 0
y_reg[y_reg < 0] = 0

plt.plot(y_cas, label='casual')
plt.plot(y_reg, label='registered')
plt.plot(y_cas + y_reg, label='casual + registered')
plt.legend()
plt.show()









    



0.331143451575






    Out[7]:





[<matplotlib.lines.Line2D at 0x10b3dda50>]

Write out the submission!



In [8]:

    
# Use the sample submission as a template: it supplies the datetime column
# and the expected number of rows.
sample_submission = pd.read_csv('sampleSubmission.csv')

# Rename the last column via a fresh label list; writing into
# `columns.values` would mutate the Index's backing array in place.
new_labels = list(sample_submission.columns[:-1]) + ['total_rentals']
sample_submission.columns = new_labels
print(sample_submission.shape)

# The copy inherits the renamed columns, so no second relabel is needed.
my_submission = sample_submission.copy()

# Total = casual + registered predictions, rounded to whole rentals.
# NOTE(review): the assignment is positional, so y_cas / y_reg must be in the
# same row order as the submission file - confirm upstream.
my_submission['total_rentals'] = np.round(y_cas + y_reg)

plt.plot(my_submission.total_rentals)
print(my_submission.shape)
my_submission.head()









    



(6493, 2)
(6493, 2)






    Out[8]:






  
    
      
      datetime
      total_rentals
    
  
  
    
      0
       2011-01-20 00:00:00
       71
    
    
      1
       2011-01-20 01:00:00
       92
    
    
      2
       2011-01-20 02:00:00
       95
    
    
      3
       2011-01-20 03:00:00
       89
    
    
      4
       2011-01-20 04:00:00
       91
    
  

5 rows × 2 columns



In [9]:

    
# Rename the prediction column to 'count' before writing the file
# (presumably the header the submission format expects - confirm against
# sampleSubmission.csv). A fresh label list is assigned rather than editing
# `columns.values` in place, which would mutate the Index's backing array.
new_labels = list(my_submission.columns[:-1]) + ['count']
my_submission.columns = new_labels

# index=False keeps the row index out of the CSV.
my_submission.to_csv('poly_month-to-month.csv', index=False)



In [9]:



In [ ]:

	datetime	season	weather	temp	atemp	humidity	casual	registered	total_rentals
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	3	13	16
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	8	32	40
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	5	27	32
3	2011-01-01 03:00:00	1	1	9.84	14.395	75	3	10	13
4	2011-01-01 04:00:00	1	1	9.84	14.395	75	0	1	1

	month	hour	season	weather	temp	atemp	humidity	temp2	atemp2	humidity2
0	1	0	1	1	9.84	14.395	81	3.136877	3.794074	9.000000
1	1	1	1	1	9.02	13.635	80	3.003331	3.692560	8.944272
2	1	2	1	1	9.02	13.635	80	3.003331	3.692560	8.944272
3	1	3	1	1	9.84	14.395	75	3.136877	3.794074	8.660254
4	1	4	1	1	9.84	14.395	75	3.136877	3.794074	8.660254

	datetime	month	hour	season	workingday	weather	temp	atemp	humidity	windspeed	temp2	atemp2	humidity2	windspeed2
0	2011-01-20 00:00:00	1	0	1	1	1	10.66	11.365	56	26.0027	3.264966	3.371202	7.483315	5.099284
1	2011-01-20 01:00:00	1	1	1	1	1	10.66	13.635	56	0.0000	3.264966	3.692560	7.483315	0.000000
2	2011-01-20 02:00:00	1	2	1	1	1	10.66	13.635	56	0.0000	3.264966	3.692560	7.483315	0.000000
3	2011-01-20 03:00:00	1	3	1	1	1	10.66	12.880	56	11.0014	3.264966	3.588872	7.483315	3.316836
4	2011-01-20 04:00:00	1	4	1	1	1	10.66	12.880	56	11.0014	3.264966	3.588872	7.483315	3.316836