In [1]:
%pylab inline
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn import linear_model
In [2]:
# Load the training data and rename its last column to 'total_rentals';
# later cells read that column as `train.total_rentals`.
train = pd.read_csv('train.csv')

# Build a fresh label list rather than writing into `train.columns.values`:
# a pandas Index is documented as immutable, and editing its backing array
# in place is unsupported.
new_labels = list(train.columns[:-1]) + ['total_rentals']
train.columns = new_labels

# Preview the first five rows.
train.head()
Out[2]:
In [3]:
# Draw each rental series in its own figure, in the row order of `train`.
for series_name in ('casual', 'registered', 'total_rentals'):
    plt.plot(train[series_name])
    plt.show()
In [4]:
# Feature matrix: columns 1..8 of `train`, selected by position.
# `.iloc` replaces the `.ix` indexer (removed from pandas), and `.copy()`
# makes X independent of `train`, so the inserts below do not write into a
# slice of another frame.
X = train.iloc[:, 1:9].copy()

# For each of X's columns 4..7, append a derived column named '<col>2'
# holding the square root (** .5) of the original values.
for offset, col in enumerate(X.columns[4:8]):
    X.insert(offset + 8, col + str(2), X[col] ** .5)

# Parse the timestamps with pandas instead of `datetime.datetime.strptime`:
# the `datetime` module is never imported explicitly in this notebook (it is
# only in scope through `%pylab`), and the vectorised parser avoids a Python
# loop. `.tolist()` keeps `months` / `hours` as plain lists, as before.
dts = pd.to_datetime(train['datetime'], format='%Y-%m-%d %H:%M:%S')
months = dts.dt.month.tolist()
hours = dts.dt.hour.tolist()

# Put 'month' first and 'hour' second.
X.insert(0, 'hour', hours)
X.insert(0, 'month', months)
X[:5]
Out[4]:
In [5]:
# Targets: the last three columns of `train`, taken by position in reverse
# order (last column first). `.iloc` replaces the `.ix` indexer, which has
# been removed from pandas; the slice is positional either way.
y = train.iloc[:, -1:-4:-1]
y[:5]
Out[5]:
In [6]:
# Load the test data and derive the same kind of extra columns that the
# training features get: '<col>2' square-root columns plus 'month' / 'hour'.
test_set = pd.read_csv('test.csv')

# For each of columns 5..8, append a derived column named '<col>2' holding
# the square root (** .5) of the original values.
for offset, col in enumerate(test_set.columns[5:9]):
    test_set.insert(offset + 9, col + str(2), test_set[col] ** .5)

# Parse the timestamps with pandas instead of `datetime.datetime.strptime`:
# the `datetime` module is never imported explicitly in this notebook (it is
# only in scope through `%pylab`). `.tolist()` keeps plain lists, as before.
dts = pd.to_datetime(test_set['datetime'], format='%Y-%m-%d %H:%M:%S')
months = dts.dt.month.tolist()
hours = dts.dt.hour.tolist()

# Formatted so the output is the same "<n> <n>" text on Python 2 and 3.
print('%d %d' % (len(months), len(hours)))

# Insert right after column 0, giving: column 0, 'month', 'hour', ...
test_set.insert(1, 'hour', hours)
test_set.insert(1, 'month', months)
print(test_set.shape)
test_set[:5]
Out[6]:
In [7]:
# Fit one multi-output linear regression per calendar month (1..12) on the
# training rows of that month, and predict the test rows of the same month.
lm = linear_model.LinearRegression(fit_intercept=True)

# Test features are every column of `test_set` except column 0
# (`.iloc` replaces the removed `.ix` indexer).
test_features = test_set.iloc[:, 1:]

# Preallocate one slot per test row. Each month's predictions are written
# back at that month's row positions, so the result stays in `test_set` row
# order. Appending month by month (the previous approach) produced arrays
# grouped by month, which no longer line up with the rows of the submission
# file whenever the test rows are not already grouped by month.
n_test = len(test_set)
y_tot = np.full(n_test, np.nan)
y_reg = np.full(n_test, np.nan)
y_cas = np.full(n_test, np.nan)

mean_score = 0
for m in range(1, 13):
    train_mask = (X.month == m).values
    test_mask = (test_set.month == m).values

    # Subset once per month instead of recomputing it for fit and score.
    X_month = X[train_mask]
    y_month = y[train_mask]

    lm.fit(X_month, y_month)
    # Score is measured on the same rows the model was fitted on,
    # then averaged over the 12 monthly models.
    mean_score += lm.score(X_month, y_month) / 12

    # Prediction columns follow the column order of `y`.
    pred = lm.predict(test_features[test_mask])
    y_tot[test_mask] = pred[:, 0]
    y_reg[test_mask] = pred[:, 1]
    y_cas[test_mask] = pred[:, 2]

print(mean_score)

# Clamp negative predictions to zero.
y_cas[y_cas < 0] = 0
y_reg[y_reg < 0] = 0

plt.plot(y_cas)
plt.plot(y_reg)
plt.plot(y_cas + y_reg)
Out[7]:
In [8]:
# Load the sample submission and rename its last column to 'total_rentals'.
sample_submission = pd.read_csv('sampleSubmission.csv')

# Assign a fresh label list rather than writing into `columns.values`:
# a pandas Index is documented as immutable, and editing its backing array
# in place is unsupported.
new_labels = list(sample_submission.columns[:-1]) + ['total_rentals']
sample_submission.columns = new_labels
print(sample_submission.shape)

# The copy already carries the renamed column, so no second rename is needed.
my_submission = sample_submission.copy()

# Predicted total = casual + registered predictions, rounded.
# Bracket assignment is the explicit way to overwrite a column.
my_submission['total_rentals'] = np.round(y_cas + y_reg)

plt.plot(my_submission.total_rentals)
print(my_submission.shape)
my_submission[:5]
Out[8]:
In [9]:
# Rename the last column to 'count' and write the submission file.
# A fresh label list is assigned instead of editing `columns.values` in
# place, since a pandas Index is documented as immutable.
new_labels = list(my_submission.columns[:-1]) + ['count']
my_submission.columns = new_labels
my_submission.to_csv('poly_month-to-month.csv', index=False)
In [9]:
In [ ]: