In [1]:
%pylab inline
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn import linear_model


Populating the interactive namespace from numpy and matplotlib

In [2]:
# Load the training data and rename its last column to 'total_rentals'.
# (Presumably the original label collides with a DataFrame attribute; later
# cells read the column as `train.total_rentals` -- confirm against the CSV.)
train = pd.read_csv('train.csv')
# Use rename() rather than writing into `train.columns.values`: an Index is
# meant to be immutable, and mutating its backing array in place is unsupported.
train = train.rename(columns={train.columns[-1]: 'total_rentals'})
train[:5]


Out[2]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered total_rentals
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0 3 13 16
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0 8 32 40
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0 5 27 32
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0 3 10 13
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0 0 1 1

5 rows × 12 columns


In [3]:
# Draw each rental series (casual, registered, total) in its own figure,
# in that order.
for rental_series in (train.casual, train.registered, train.total_rentals):
    plt.plot(rental_series)
    plt.show()



In [21]:
# Feature matrix: positional columns 1..8 of train (season ... windspeed),
# i.e. everything between the datetime string and the rental counts.
# `.iloc` replaces the removed `.ix` indexer; `.copy()` makes X independent of
# train so the inserts below do not operate on a slice of another frame.
X = train.iloc[:, 1:9].copy()

# Parse all timestamps in one vectorised call instead of a per-row strptime
# loop. This also removes the reliance on `datetime` being injected into the
# namespace by %pylab.
timestamps = pd.DatetimeIndex(
    pd.to_datetime(train.datetime, format='%Y-%m-%d %H:%M:%S'))

# Prepend the derived calendar features; final column order is
# month, hour, season, ..., windspeed.
X.insert(0, 'hour', list(timestamps.hour))
X.insert(0, 'month', list(timestamps.month))

X[:5]


Out[21]:
month hour season holiday workingday weather temp atemp humidity windspeed
0 1 0 1 0 0 1 9.84 14.395 81 0
1 1 1 1 0 0 1 9.02 13.635 80 0
2 1 2 1 0 0 1 9.02 13.635 80 0
3 1 3 1 0 0 1 9.84 14.395 75 0
4 1 4 1 0 0 1 9.84 14.395 75 0

5 rows × 10 columns


In [22]:
# Targets: the last three columns of train taken in reverse order, giving
# total_rentals, registered, casual. `.iloc` replaces the removed `.ix`
# indexer and makes the positional intent explicit.
y = train.iloc[:, [-1, -2, -3]]
y[:5]


Out[22]:
total_rentals registered casual
0 16 13 3
1 40 32 8
2 32 27 5
3 13 10 3
4 1 1 0

5 rows × 3 columns


In [23]:
# Load the test data and add the same derived 'month' and 'hour' features
# that the training feature matrix carries.
test_set = pd.read_csv('test.csv')

# Vectorised timestamp parsing; avoids the per-row strptime loop and the
# dependency on `datetime` being injected into the namespace by %pylab.
timestamps = pd.DatetimeIndex(
    pd.to_datetime(test_set.datetime, format='%Y-%m-%d %H:%M:%S'))
test_months = list(timestamps.month)
test_hours = list(timestamps.hour)
print('%d %d' % (len(test_months), len(test_hours)))

# Insert right after the datetime column so that columns 1.. are
# month, hour, season, ..., windspeed.
test_set.insert(1, 'hour', test_hours)
test_set.insert(1, 'month', test_months)
print(test_set.shape)
test_set[:5]


6493 6493
(6493, 11)
Out[23]:
datetime month hour season holiday workingday weather temp atemp humidity windspeed
0 2011-01-20 00:00:00 1 0 1 0 1 1 10.66 11.365 56 26.0027
1 2011-01-20 01:00:00 1 1 1 0 1 1 10.66 13.635 56 0.0000
2 2011-01-20 02:00:00 1 2 1 0 1 1 10.66 13.635 56 0.0000
3 2011-01-20 03:00:00 1 3 1 0 1 1 10.66 12.880 56 11.0014
4 2011-01-20 04:00:00 1 4 1 0 1 1 10.66 12.880 56 11.0014

5 rows × 11 columns


In [30]:
# Sanity check: the month-1 subset of the features and of the targets must
# have the same number of rows.
is_january = X.month == 1
print('%s %s' % (X[is_january].shape, y[is_january].shape))


(884, 10) (884, 3)

In [36]:
# Fit one BayesianRidge model per calendar month and per target
# (total_rentals, registered, casual), then predict the matching month of the
# test set.
lm = linear_model.BayesianRidge(compute_score=True)

# Pre-allocate one prediction slot per test row. Predictions are written back
# through the month mask so they stay aligned with test_set's row order.
# Appending month by month (np.append) would return the values grouped by
# month, which no longer matches the row order whenever the test rows are not
# already grouped that way (e.g. chronological data spanning several years).
n_test_rows = len(test_set)
y_tot = np.zeros(n_test_rows)
y_reg = np.zeros(n_test_rows)
y_cas = np.zeros(n_test_rows)
# Same order as the columns of y: total_rentals, registered, casual.
predictions = (y_tot, y_reg, y_cas)

total_scores = []
print('%s %s' % (X.shape, y.shape))
for m in range(1, 13):
    train_rows = (X.month == m).values
    test_rows = (test_set.month == m).values
    X_month = X[train_rows]
    # Column 0 of test_set is the raw datetime string; the remaining columns
    # line up with X (month, hour, season, ..., windspeed).
    X_test_month = test_set.loc[test_rows].iloc[:, 1:]

    for col, predicted in enumerate(predictions):
        target = y.iloc[train_rows, col]
        lm.fit(X_month, target)
        if col == 0:
            # Score of the total_rentals model on its own training rows;
            # this is an in-sample figure, not a validation score.
            total_scores.append(lm.score(X_month, target))
        if test_rows.any():
            predicted[test_rows] = lm.predict(X_test_month)

# Average of the per-month training scores of the total_rentals model.
mean_score = np.mean(total_scores)
print(mean_score)

# Rental counts cannot be negative; clamp the two component predictions that
# are summed for the submission. y_tot is left as predicted for comparison.
y_cas[y_cas < 0] = 0
y_reg[y_reg < 0] = 0

plt.plot(y_cas, label='casual')
plt.plot(y_reg, label='registered')
plt.plot(y_cas + y_reg, label='casual + registered')
plt.plot(y_tot, label='total (direct model)')
plt.legend()
plt.show()


(10886, 10) (10886, 3)
0.333684657839

Write out the submission!


In [37]:
# Build the submission frame from the sample file: keep its datetime column
# and replace the last column with the rounded casual + registered prediction.
sample_submission = pd.read_csv('sampleSubmission.csv')
# rename() instead of writing into `columns.values`: mutating an Index's
# backing array in place is unsupported and the array may be shared with
# frames derived from this one.
sample_submission = sample_submission.rename(
    columns={sample_submission.columns[-1]: 'total_rentals'})
print(sample_submission.shape)

# The copy already carries the renamed column, so no second rename is needed.
my_submission = sample_submission.copy()
# Item assignment always targets the column; attribute assignment only does so
# when the column already exists.
# NOTE(review): this assumes y_cas / y_reg are in the same row order as the
# sample submission file -- confirm against the cell that produces them.
my_submission['total_rentals'] = np.round(y_cas + y_reg)
plt.plot(my_submission.total_rentals)
print(my_submission.shape)
my_submission[:5]


(6493, 2)
(6493, 2)
Out[37]:
datetime total_rentals
0 2011-01-20 00:00:00 78
1 2011-01-20 01:00:00 98
2 2011-01-20 02:00:00 101
3 2011-01-20 03:00:00 95
4 2011-01-20 04:00:00 98

5 rows × 2 columns


In [39]:
# Rename the last column to 'count' for the output file and write the CSV
# without the row index.
# rename() replaces the in-place edit of `columns.values`, which mutates the
# backing array of an Index that pandas treats as immutable.
my_submission = my_submission.rename(
    columns={my_submission.columns[-1]: 'count'})
my_submission.to_csv('m2m-bayesianridge.csv', index=False)

In [ ]: