In [28]:
%pylab inline
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn import linear_model


Populating the interactive namespace from numpy and matplotlib

In [29]:
# Load the training data and relabel its final column 'total_rentals'
# (presumably so the column is reachable by attribute access without
# colliding with a DataFrame method name -- TODO confirm original label).
train = pd.read_csv('train.csv')
# Edit a *copy* of the labels: Index objects are meant to be immutable, and
# writing into `columns.values` would modify the backing array in place.
new_labels = train.columns.tolist()
new_labels[-1] = 'total_rentals'
train.columns = new_labels

# Positional slice of the columns at positions 9, 10 and 11.
# `.iloc` is explicit about being positional, unlike `.ix`.
print(train.iloc[:, 9:12].shape)

# One panel per selected column, each plotted against the row index.
fig, ax = plt.subplots(1, 3)
for offset, panel in enumerate(ax):
    panel.plot(train.iloc[:, 9 + offset])
plt.show()


(10886, 3)

In [30]:
# Feature matrix: the columns of `train` at positions 1..8, plus an
# hour-of-day column derived from the timestamp strings.
# `.copy()` makes X an independent frame, so the in-place insert below
# cannot affect (or raise copy warnings about) `train`.
X = train.iloc[:, 1:9].copy()
# Vectorised parse using the same strict format string as before; this also
# removes the dependence on a `datetime` name that no cell imports explicitly.
hours = pd.to_datetime(train['datetime'], format='%Y-%m-%d %H:%M:%S').dt.hour.tolist()
X.insert(0, 'time', hours)
# Targets: the columns at positions 9, 10 and 11.
y = train.iloc[:, 9:12]
# Formatted explicitly so the output is identical under Python 2 and 3.
print('{0} {1}'.format(X.shape, y.shape))

# Project the features onto two principal components.
pca2 = PCA(n_components=2)
X_2 = pca2.fit_transform(X)
print(X_2.shape)


(10886, 9) (10886, 3)
(10886, 2)

In [31]:
# Load the test data and add an hour-of-day column right after the
# timestamp column (position 1).
test_set = pd.read_csv('test.csv')
# Vectorised parse with a strict format; avoids relying on a `datetime`
# name that no cell imports explicitly.
hours = pd.to_datetime(test_set['datetime'], format='%Y-%m-%d %H:%M:%S').dt.hour.tolist()
test_set.insert(1, 'time', hours)
print(test_set.shape)
# Last expression of the cell: rendered as the cell's rich output.
test_set.head()


(6493, 10)
Out[31]:
datetime time season holiday workingday weather temp atemp humidity windspeed
0 2011-01-20 00:00:00 0 1 0 1 1 10.66 11.365 56 26.0027
1 2011-01-20 01:00:00 1 1 0 1 1 10.66 13.635 56 0.0000
2 2011-01-20 02:00:00 2 1 0 1 1 10.66 13.635 56 0.0000
3 2011-01-20 03:00:00 3 1 0 1 1 10.66 12.880 56 11.0014
4 2011-01-20 04:00:00 4 1 0 1 1 10.66 12.880 56 11.0014

5 rows × 10 columns


In [32]:
# Fit one multi-output linear regression for all columns of y at once.
lm = linear_model.LinearRegression()
lm.fit(X, y)
# Score is computed on the same data the model was fitted on, so it is a
# training-set figure, not a held-out estimate.
print(lm.score(X, y))

# Predict from every test column except the first (the raw timestamp string).
y_ = lm.predict(test_set.iloc[:, 1:])

# One panel per predicted target column; the last panel shows the sum of
# the first two *training* target columns for visual comparison.
fig, ax = plt.subplots(1, 4)
for i, d in enumerate(y_.T):
    ax[i].plot(d)
ax[3].plot(y.iloc[:, 0] + y.iloc[:, 1])
# Show the figure explicitly (as the other plotting cells do) so the cell
# does not end on a bare `[<Line2D ...>]` repr.
plt.show()


0.316606687211
Out[32]:
[<matplotlib.lines.Line2D at 0x116ac6fd0>]

Write out the submission!


In [26]:
# Use the sample submission as a template (it supplies the datetime column
# and the row order), relabelling its last column 'total_rentals'.
sample_submission = pd.read_csv('sampleSubmission.csv')
# Edit a copy of the labels rather than writing into `columns.values`,
# which would mutate the Index's backing array in place.
new_labels = sample_submission.columns.tolist()
new_labels[-1] = 'total_rentals'
sample_submission.columns = new_labels

my_submission = sample_submission.copy()
# Third prediction column -> submitted value. Floor at zero *before*
# rounding: rounding first turns small negatives into -0.0, which a later
# `< 0` test does not catch. Stored as integers since these are counts.
my_submission['total_rentals'] = np.round(np.maximum(y_[:, 2], 0)).astype(int)

plt.plot(my_submission['total_rentals'])
plt.show()
print(my_submission.shape)
# Last expression of the cell: rendered as the cell's rich output.
my_submission.head()


(6493, 2)
Out[26]:
datetime total_rentals
0 2011-01-20 00:00:00 15
1 2011-01-20 01:00:00 23
2 2011-01-20 02:00:00 30
3 2011-01-20 03:00:00 39
4 2011-01-20 04:00:00 46

5 rows × 2 columns


In [27]:
# Relabel the last column 'count' for the output file, then write the CSV
# without the row index.
# Work on a copy of the labels: writing into `columns.values` would mutate
# the Index's backing array in place.
new_labels = my_submission.columns.tolist()
new_labels[-1] = 'count'
my_submission.columns = new_labels
my_submission.to_csv('my_submission.csv', index=False)

In [ ]: