In [28]:
%pylab inline
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn import linear_model
In [29]:
# Load the training data and rename its last column to 'total_rentals'.
train = pd.read_csv('train.csv')
# Build a fresh list of labels instead of writing into `train.columns.values`:
# that array backs the (immutable) Index, and mutating it in place is
# unsupported and fails outright when pandas hands back a read-only array.
train.columns = list(train.columns[:-1]) + ['total_rentals']

# The three columns at positions 9..11 (selected positionally with .iloc,
# which is what the integer slice on .ix resolved to for string labels).
target_columns = train.iloc[:, 9:12]
print(target_columns.shape)

# One panel per selected column, titled with the column name.
fig, ax = plt.subplots(1, 3)
for axis, column in zip(ax, target_columns.columns):
    axis.plot(target_columns[column])
    axis.set_title(column)
plt.show()
In [30]:
# Feature matrix: the columns at positions 1..8 of `train`.
# .copy() gives X its own data, so the insert() below never operates on a
# slice of `train` (avoids SettingWithCopy-style surprises).
X = train.iloc[:, 1:9].copy()
# Hour of day parsed from the `datetime` strings, added as the first feature.
hours = [datetime.datetime.strptime(d, '%Y-%m-%d %H:%M:%S').hour for d in train.datetime]
X.insert(0, 'time', hours)
# Targets: the three columns at positions 9..11 of `train`.
y = train.iloc[:, 9:12]
# Formatted explicitly so the output is the same under Python 2 and Python 3.
print('%s %s' % (X.shape, y.shape))
# Two-component PCA projection of the features.
# NOTE(review): X_2 is not used by any later cell visible here.
pca2 = PCA(n_components=2)
X_2 = pca2.fit_transform(X)
print(X_2.shape)
In [31]:
# Read the test data and add an hour-of-day column, parsed from the
# `datetime` strings, at column position 1.
test_set = pd.read_csv('test.csv')
hours = []
for stamp in test_set.datetime:
    parsed = datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S')
    hours.append(parsed.hour)
test_set.insert(1, 'time', hours)
print(test_set.shape)
# Show the first five rows as the cell's output.
test_set.head(5)
Out[31]:
In [32]:
# Fit a single linear regression that predicts all columns of `y` at once.
lm = linear_model.LinearRegression()
lm.fit(X, y)
# NOTE: this score is computed on the same data the model was fitted on,
# so it is not an out-of-sample estimate.
print(lm.score(X, y))
# Pick the test features by name so they are guaranteed to line up with the
# columns the model was fitted on, rather than relying on column positions.
y_ = lm.predict(test_set[X.columns])
fig, ax = plt.subplots(1, 4)
# One panel per predicted target column (y_ has one column per column of y).
for i, d in enumerate(y_.T):
    ax[i].plot(d)
# Last panel: sum of the first two training target columns, for comparison.
ax[3].plot(y.iloc[:, 0] + y.iloc[:, 1])
Out[32]:
In [26]:
# Use the sample submission as a template; its last column is renamed to
# 'total_rentals' while working with it.
sample_submission = pd.read_csv('sampleSubmission.csv')
# Assign a fresh list of labels rather than writing into `columns.values`,
# which mutates the array backing the (immutable) Index.
sample_submission.columns = list(sample_submission.columns[:-1]) + ['total_rentals']

my_submission = sample_submission.copy()
# Third prediction column, floored at zero and rounded to whole numbers.
# Clamping BEFORE rounding matters: rounding first turns values in [-0.5, 0)
# into -0.0, which is not < 0 and would therefore survive a later
# "set negatives to 0" step and be written out as -0.0.
# NOTE(review): column 2 of y_ is presumably the total-rentals prediction
# (third target column of the fitted model) - confirm against the fit cell.
my_submission['total_rentals'] = np.round(np.maximum(y_[:, 2], 0))

plt.plot(my_submission.total_rentals)
plt.show()
print(my_submission.shape)
my_submission[:5]
Out[26]:
In [27]:
# Rename the last column to 'count' and write the submission file.
# A fresh list of labels is assigned instead of editing `columns.values`
# in place, which mutates the array backing the (immutable) Index.
my_submission.columns = list(my_submission.columns[:-1]) + ['count']
my_submission.to_csv('my_submission.csv', index=False)
In [ ]: