In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import *
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

import numpy as np

%matplotlib inline

In [2]:
data = pd.read_csv('bikes_train.csv') # , index_col=0 to use the datetime column as the DataFrame index
data.head()


Out[2]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count
0 2011-01-01 00:00:00 1 0 0 1 9.84 14.395 81 0.0 3 13 16
1 2011-01-01 01:00:00 1 0 0 1 9.02 13.635 80 0.0 8 32 40
2 2011-01-01 02:00:00 1 0 0 1 9.02 13.635 80 0.0 5 27 32
3 2011-01-01 03:00:00 1 0 0 1 9.84 14.395 75 0.0 3 10 13
4 2011-01-01 04:00:00 1 0 0 1 9.84 14.395 75 0.0 0 1 1

In [3]:
data.shape


Out[3]:
(10886, 12)

In [4]:
data.isnull().values.any()


Out[4]:
False

In [5]:
data.dropna()
data.shape


Out[5]:
(10886, 12)

In [6]:
def transform_data(data):
    data.datetime = data.datetime.apply(pd.to_datetime)
    data['month'] = data.datetime.apply(lambda x : x.month)
    data['hour'] = data.datetime.apply(lambda x : x.hour)
    data['day'] = data.datetime.apply(lambda x : x.timetuple().tm_yday)
    data['dayofweek'] = data.datetime.apply(lambda x : x.isoweekday())
    data.head()
    
transform_data(data)

In [7]:
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data)

In [8]:
train_data.shape


Out[8]:
(8164, 16)

In [9]:
test_data.shape


Out[9]:
(2722, 16)

In [10]:
fit_columns = ['day', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'month', 'hour', 'dayofweek']

In [11]:
model = Pipeline([#('scaler', Normalizer()),
                  ('poly', PolynomialFeatures(degree=3)),
                  ('linear', LinearRegression(fit_intercept=False))])
model_result = model.fit(train_data[fit_columns], train_data[["count"]])

In [12]:
test_predictions = model.predict(test_data[fit_columns])
test_predictions


Out[12]:
array([[  92.74113009],
       [ 310.58076482],
       [ 282.5618071 ],
       ..., 
       [  33.53932947],
       [ 172.08481741],
       [  32.29911987]])

In [13]:
flattened_predictions = np.ravel(test_predictions)

In [14]:
test_data["prediction"] = flattened_predictions
test_data.head(2)


D:\lib\anaconda\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
Out[14]:
datetime season holiday workingday weather temp atemp humidity windspeed casual registered count month hour day dayofweek prediction
10650 2012-12-10 04:00:00 4 0 1 2 15.58 19.695 94 7.0015 3 9 12 12 4 345 1 92.741130
4209 2011-10-07 10:00:00 4 0 1 1 21.32 25.000 68 6.0032 48 126 174 10 10 280 5 310.580765

In [15]:
print("The mean square error is %d" % np.mean((test_data["prediction"] - test_data["count"]) ** 2))


The mean square error is 13783

In [16]:
test_data[["day", "count", "prediction"]].groupby("day").aggregate(np.sum).plot(figsize=(20, 8))


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0xf29e3ed3c8>

In [17]:
test_data[["month", "count", "prediction"]].groupby("month").aggregate(np.sum).plot(figsize=(20, 8))


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0xf29dabaa90>

In [ ]:


In [18]:
model_result.named_steps["poly"].get_feature_names(fit_columns)


Out[18]:
['1',
 'day',
 'season',
 'holiday',
 'workingday',
 'weather',
 'temp',
 'atemp',
 'humidity',
 'windspeed',
 'month',
 'hour',
 'dayofweek',
 'day^2',
 'day season',
 'day holiday',
 'day workingday',
 'day weather',
 'day temp',
 'day atemp',
 'day humidity',
 'day windspeed',
 'day month',
 'day hour',
 'day dayofweek',
 'season^2',
 'season holiday',
 'season workingday',
 'season weather',
 'season temp',
 'season atemp',
 'season humidity',
 'season windspeed',
 'season month',
 'season hour',
 'season dayofweek',
 'holiday^2',
 'holiday workingday',
 'holiday weather',
 'holiday temp',
 'holiday atemp',
 'holiday humidity',
 'holiday windspeed',
 'holiday month',
 'holiday hour',
 'holiday dayofweek',
 'workingday^2',
 'workingday weather',
 'workingday temp',
 'workingday atemp',
 'workingday humidity',
 'workingday windspeed',
 'workingday month',
 'workingday hour',
 'workingday dayofweek',
 'weather^2',
 'weather temp',
 'weather atemp',
 'weather humidity',
 'weather windspeed',
 'weather month',
 'weather hour',
 'weather dayofweek',
 'temp^2',
 'temp atemp',
 'temp humidity',
 'temp windspeed',
 'temp month',
 'temp hour',
 'temp dayofweek',
 'atemp^2',
 'atemp humidity',
 'atemp windspeed',
 'atemp month',
 'atemp hour',
 'atemp dayofweek',
 'humidity^2',
 'humidity windspeed',
 'humidity month',
 'humidity hour',
 'humidity dayofweek',
 'windspeed^2',
 'windspeed month',
 'windspeed hour',
 'windspeed dayofweek',
 'month^2',
 'month hour',
 'month dayofweek',
 'hour^2',
 'hour dayofweek',
 'dayofweek^2',
 'day^3',
 'day^2 season',
 'day^2 holiday',
 'day^2 workingday',
 'day^2 weather',
 'day^2 temp',
 'day^2 atemp',
 'day^2 humidity',
 'day^2 windspeed',
 'day^2 month',
 'day^2 hour',
 'day^2 dayofweek',
 'day season^2',
 'day season holiday',
 'day season workingday',
 'day season weather',
 'day season temp',
 'day season atemp',
 'day season humidity',
 'day season windspeed',
 'day season month',
 'day season hour',
 'day season dayofweek',
 'day holiday^2',
 'day holiday workingday',
 'day holiday weather',
 'day holiday temp',
 'day holiday atemp',
 'day holiday humidity',
 'day holiday windspeed',
 'day holiday month',
 'day holiday hour',
 'day holiday dayofweek',
 'day workingday^2',
 'day workingday weather',
 'day workingday temp',
 'day workingday atemp',
 'day workingday humidity',
 'day workingday windspeed',
 'day workingday month',
 'day workingday hour',
 'day workingday dayofweek',
 'day weather^2',
 'day weather temp',
 'day weather atemp',
 'day weather humidity',
 'day weather windspeed',
 'day weather month',
 'day weather hour',
 'day weather dayofweek',
 'day temp^2',
 'day temp atemp',
 'day temp humidity',
 'day temp windspeed',
 'day temp month',
 'day temp hour',
 'day temp dayofweek',
 'day atemp^2',
 'day atemp humidity',
 'day atemp windspeed',
 'day atemp month',
 'day atemp hour',
 'day atemp dayofweek',
 'day humidity^2',
 'day humidity windspeed',
 'day humidity month',
 'day humidity hour',
 'day humidity dayofweek',
 'day windspeed^2',
 'day windspeed month',
 'day windspeed hour',
 'day windspeed dayofweek',
 'day month^2',
 'day month hour',
 'day month dayofweek',
 'day hour^2',
 'day hour dayofweek',
 'day dayofweek^2',
 'season^3',
 'season^2 holiday',
 'season^2 workingday',
 'season^2 weather',
 'season^2 temp',
 'season^2 atemp',
 'season^2 humidity',
 'season^2 windspeed',
 'season^2 month',
 'season^2 hour',
 'season^2 dayofweek',
 'season holiday^2',
 'season holiday workingday',
 'season holiday weather',
 'season holiday temp',
 'season holiday atemp',
 'season holiday humidity',
 'season holiday windspeed',
 'season holiday month',
 'season holiday hour',
 'season holiday dayofweek',
 'season workingday^2',
 'season workingday weather',
 'season workingday temp',
 'season workingday atemp',
 'season workingday humidity',
 'season workingday windspeed',
 'season workingday month',
 'season workingday hour',
 'season workingday dayofweek',
 'season weather^2',
 'season weather temp',
 'season weather atemp',
 'season weather humidity',
 'season weather windspeed',
 'season weather month',
 'season weather hour',
 'season weather dayofweek',
 'season temp^2',
 'season temp atemp',
 'season temp humidity',
 'season temp windspeed',
 'season temp month',
 'season temp hour',
 'season temp dayofweek',
 'season atemp^2',
 'season atemp humidity',
 'season atemp windspeed',
 'season atemp month',
 'season atemp hour',
 'season atemp dayofweek',
 'season humidity^2',
 'season humidity windspeed',
 'season humidity month',
 'season humidity hour',
 'season humidity dayofweek',
 'season windspeed^2',
 'season windspeed month',
 'season windspeed hour',
 'season windspeed dayofweek',
 'season month^2',
 'season month hour',
 'season month dayofweek',
 'season hour^2',
 'season hour dayofweek',
 'season dayofweek^2',
 'holiday^3',
 'holiday^2 workingday',
 'holiday^2 weather',
 'holiday^2 temp',
 'holiday^2 atemp',
 'holiday^2 humidity',
 'holiday^2 windspeed',
 'holiday^2 month',
 'holiday^2 hour',
 'holiday^2 dayofweek',
 'holiday workingday^2',
 'holiday workingday weather',
 'holiday workingday temp',
 'holiday workingday atemp',
 'holiday workingday humidity',
 'holiday workingday windspeed',
 'holiday workingday month',
 'holiday workingday hour',
 'holiday workingday dayofweek',
 'holiday weather^2',
 'holiday weather temp',
 'holiday weather atemp',
 'holiday weather humidity',
 'holiday weather windspeed',
 'holiday weather month',
 'holiday weather hour',
 'holiday weather dayofweek',
 'holiday temp^2',
 'holiday temp atemp',
 'holiday temp humidity',
 'holiday temp windspeed',
 'holiday temp month',
 'holiday temp hour',
 'holiday temp dayofweek',
 'holiday atemp^2',
 'holiday atemp humidity',
 'holiday atemp windspeed',
 'holiday atemp month',
 'holiday atemp hour',
 'holiday atemp dayofweek',
 'holiday humidity^2',
 'holiday humidity windspeed',
 'holiday humidity month',
 'holiday humidity hour',
 'holiday humidity dayofweek',
 'holiday windspeed^2',
 'holiday windspeed month',
 'holiday windspeed hour',
 'holiday windspeed dayofweek',
 'holiday month^2',
 'holiday month hour',
 'holiday month dayofweek',
 'holiday hour^2',
 'holiday hour dayofweek',
 'holiday dayofweek^2',
 'workingday^3',
 'workingday^2 weather',
 'workingday^2 temp',
 'workingday^2 atemp',
 'workingday^2 humidity',
 'workingday^2 windspeed',
 'workingday^2 month',
 'workingday^2 hour',
 'workingday^2 dayofweek',
 'workingday weather^2',
 'workingday weather temp',
 'workingday weather atemp',
 'workingday weather humidity',
 'workingday weather windspeed',
 'workingday weather month',
 'workingday weather hour',
 'workingday weather dayofweek',
 'workingday temp^2',
 'workingday temp atemp',
 'workingday temp humidity',
 'workingday temp windspeed',
 'workingday temp month',
 'workingday temp hour',
 'workingday temp dayofweek',
 'workingday atemp^2',
 'workingday atemp humidity',
 'workingday atemp windspeed',
 'workingday atemp month',
 'workingday atemp hour',
 'workingday atemp dayofweek',
 'workingday humidity^2',
 'workingday humidity windspeed',
 'workingday humidity month',
 'workingday humidity hour',
 'workingday humidity dayofweek',
 'workingday windspeed^2',
 'workingday windspeed month',
 'workingday windspeed hour',
 'workingday windspeed dayofweek',
 'workingday month^2',
 'workingday month hour',
 'workingday month dayofweek',
 'workingday hour^2',
 'workingday hour dayofweek',
 'workingday dayofweek^2',
 'weather^3',
 'weather^2 temp',
 'weather^2 atemp',
 'weather^2 humidity',
 'weather^2 windspeed',
 'weather^2 month',
 'weather^2 hour',
 'weather^2 dayofweek',
 'weather temp^2',
 'weather temp atemp',
 'weather temp humidity',
 'weather temp windspeed',
 'weather temp month',
 'weather temp hour',
 'weather temp dayofweek',
 'weather atemp^2',
 'weather atemp humidity',
 'weather atemp windspeed',
 'weather atemp month',
 'weather atemp hour',
 'weather atemp dayofweek',
 'weather humidity^2',
 'weather humidity windspeed',
 'weather humidity month',
 'weather humidity hour',
 'weather humidity dayofweek',
 'weather windspeed^2',
 'weather windspeed month',
 'weather windspeed hour',
 'weather windspeed dayofweek',
 'weather month^2',
 'weather month hour',
 'weather month dayofweek',
 'weather hour^2',
 'weather hour dayofweek',
 'weather dayofweek^2',
 'temp^3',
 'temp^2 atemp',
 'temp^2 humidity',
 'temp^2 windspeed',
 'temp^2 month',
 'temp^2 hour',
 'temp^2 dayofweek',
 'temp atemp^2',
 'temp atemp humidity',
 'temp atemp windspeed',
 'temp atemp month',
 'temp atemp hour',
 'temp atemp dayofweek',
 'temp humidity^2',
 'temp humidity windspeed',
 'temp humidity month',
 'temp humidity hour',
 'temp humidity dayofweek',
 'temp windspeed^2',
 'temp windspeed month',
 'temp windspeed hour',
 'temp windspeed dayofweek',
 'temp month^2',
 'temp month hour',
 'temp month dayofweek',
 'temp hour^2',
 'temp hour dayofweek',
 'temp dayofweek^2',
 'atemp^3',
 'atemp^2 humidity',
 'atemp^2 windspeed',
 'atemp^2 month',
 'atemp^2 hour',
 'atemp^2 dayofweek',
 'atemp humidity^2',
 'atemp humidity windspeed',
 'atemp humidity month',
 'atemp humidity hour',
 'atemp humidity dayofweek',
 'atemp windspeed^2',
 'atemp windspeed month',
 'atemp windspeed hour',
 'atemp windspeed dayofweek',
 'atemp month^2',
 'atemp month hour',
 'atemp month dayofweek',
 'atemp hour^2',
 'atemp hour dayofweek',
 'atemp dayofweek^2',
 'humidity^3',
 'humidity^2 windspeed',
 'humidity^2 month',
 'humidity^2 hour',
 'humidity^2 dayofweek',
 'humidity windspeed^2',
 'humidity windspeed month',
 'humidity windspeed hour',
 'humidity windspeed dayofweek',
 'humidity month^2',
 'humidity month hour',
 'humidity month dayofweek',
 'humidity hour^2',
 'humidity hour dayofweek',
 'humidity dayofweek^2',
 'windspeed^3',
 'windspeed^2 month',
 'windspeed^2 hour',
 'windspeed^2 dayofweek',
 'windspeed month^2',
 'windspeed month hour',
 'windspeed month dayofweek',
 'windspeed hour^2',
 'windspeed hour dayofweek',
 'windspeed dayofweek^2',
 'month^3',
 'month^2 hour',
 'month^2 dayofweek',
 'month hour^2',
 'month hour dayofweek',
 'month dayofweek^2',
 'hour^3',
 'hour^2 dayofweek',
 'hour dayofweek^2',
 'dayofweek^3']

In [19]:
dataw = data[["day", "temp", "humidity", "weather", "holiday", "windspeed", "count"]].groupby("day").agg({
    "temp": np.mean, 
    "weather": np.mean, 
    "holiday": np.mean, 
    "windspeed": np.mean, 
    "humidity": np.mean,
    "day": np.mean,
    "count": np.sum})

In [20]:
fit_columnsw = ["day", "temp", "humidity", "weather", "holiday", "windspeed"]
dataw.head(2)


Out[20]:
temp weather holiday windspeed humidity day count
day
1 14.640417 1.458333 0.0 11.812519 74.916667 1 3279
2 13.048696 1.521739 0.5 19.369846 53.869565 2 2752

In [21]:
train_dataw, test_dataw = train_test_split(dataw)

In [22]:
modelw = Pipeline([('scaler', Normalizer()),
                  ('poly', PolynomialFeatures(degree=1)),
                  ('linear', LinearRegression(fit_intercept=False))])
model_resultw = modelw.fit(train_dataw[fit_columnsw], train_dataw[["count"]])

In [23]:
predictionsw = modelw.predict(test_dataw[fit_columnsw])
predictionsw


Out[23]:
array([[ 11274.84233396],
       [  9296.46525582],
       [  9425.10862322],
       [ 10305.68323847],
       [  7906.43954365],
       [  9951.73084972],
       [ 12061.98785142],
       [ 11205.61900023],
       [ 11273.06994056],
       [  8471.98437706],
       [  9911.13891895],
       [  9471.7283559 ],
       [ 11001.42080575],
       [  9599.4420017 ],
       [  3160.67900987],
       [  8835.94665948],
       [  9398.26320353],
       [  9389.04194047],
       [  8882.83578926],
       [  8791.24655846],
       [  9000.94759403],
       [ 10475.2649449 ],
       [ 12347.92889509],
       [ 11604.49285759],
       [  9652.67255795],
       [  9579.87201882],
       [ 11436.23693855],
       [ 11824.78076976],
       [ 11555.23903434],
       [ 10085.90321412],
       [ 10942.00249148],
       [  8905.76281277],
       [ 10496.45542164],
       [  9640.30016888],
       [  8914.31235255],
       [ 10957.67535567],
       [ 11525.11494078],
       [ 10618.35375149],
       [ 11811.1220807 ],
       [  9508.776891  ],
       [ 11006.12577349],
       [  8102.69926801],
       [ 11596.73182128],
       [  2370.52545371],
       [ 10750.84907624],
       [ 10473.99384288],
       [ 10189.75613157],
       [  9512.69093799],
       [  8990.32177859],
       [  8531.38165666],
       [  9568.12738901],
       [  8765.31536849],
       [ 11397.42936587],
       [  6466.00617799],
       [  9177.61513133],
       [  7294.94310093],
       [  9311.39560549],
       [ 10504.90132895],
       [ 10372.73274654],
       [  4293.95798422]])

In [24]:
test_dataw["prediction"] = np.ravel(predictionsw)


D:\lib\anaconda\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

In [25]:
print("The mean square error is %d" % np.mean((test_dataw["prediction"] - test_dataw["count"]) ** 2))


The mean square error is 4350553

In [26]:
test_dataw[["day", "count", "prediction"]].groupby("day").aggregate(np.sum).plot(figsize=(20, 8))


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0xf29e094d68>

In [27]:
model_resultw.named_steps["linear"].coef_


Out[27]:
array([[  13981.76092557,   -3609.39500685,   32026.19999912,
          -7187.98097536, -221576.55048921,    -382.73112712,
         -10182.11957285]])

In [28]:
data.corr()


Out[28]:
season holiday workingday weather temp atemp humidity windspeed casual registered count month hour day dayofweek
season 1.000000 0.029368 -0.008126 0.008879 0.258689 0.264744 0.190610 -0.147121 0.096758 0.164011 0.163439 0.971524 -0.006546 0.970196 -0.010553
holiday 0.029368 1.000000 -0.250491 -0.007074 0.000295 -0.005215 0.001929 0.008409 0.043799 -0.020956 -0.005393 0.001731 -0.000354 0.001134 -0.191832
workingday -0.008126 -0.250491 1.000000 0.033772 0.029966 0.024660 -0.010880 0.013373 -0.319111 0.119460 0.011594 -0.003394 0.002780 -0.003024 -0.704267
weather 0.008879 -0.007074 0.033772 1.000000 -0.055035 -0.055376 0.406244 0.007261 -0.135918 -0.109340 -0.128655 0.012144 -0.022740 0.011746 -0.047692
temp 0.258689 0.000295 0.029966 -0.055035 1.000000 0.984948 -0.064949 -0.017852 0.467097 0.318571 0.394454 0.257589 0.145430 0.255887 -0.038466
atemp 0.264744 -0.005215 0.024660 -0.055376 0.984948 1.000000 -0.043536 -0.057473 0.462067 0.314635 0.389784 0.264173 0.140343 0.262245 -0.040235
humidity 0.190610 0.001929 -0.010880 0.406244 -0.064949 -0.043536 1.000000 -0.318607 -0.348187 -0.265458 -0.317371 0.204537 -0.278011 0.203155 -0.026507
windspeed -0.147121 0.008409 0.013373 0.007261 -0.017852 -0.057473 -0.318607 1.000000 0.092276 0.091052 0.101369 -0.150192 0.146631 -0.148062 -0.024804
casual 0.096758 0.043799 -0.319111 -0.135918 0.467097 0.462067 -0.348187 0.092276 1.000000 0.497250 0.690414 0.092722 0.302045 0.092957 0.246959
registered 0.164011 -0.020956 0.119460 -0.109340 0.318571 0.314635 -0.265458 0.091052 0.497250 1.000000 0.970948 0.169451 0.380540 0.170805 -0.084427
count 0.163439 -0.005393 0.011594 -0.128655 0.394454 0.389784 -0.317371 0.101369 0.690414 0.970948 1.000000 0.166862 0.400601 0.168056 -0.002283
month 0.971524 0.001731 -0.003394 0.012144 0.257589 0.264173 0.204537 -0.150192 0.092722 0.169451 0.166862 1.000000 -0.006818 0.998616 -0.002266
hour -0.006546 -0.000354 0.002780 -0.022740 0.145430 0.140343 -0.278011 0.146631 0.302045 0.380540 0.400601 -0.006818 1.000000 -0.006735 -0.002925
day 0.970196 0.001134 -0.003024 0.011746 0.255887 0.262245 0.203155 -0.148062 0.092957 0.170805 0.168056 0.998616 -0.006735 1.000000 -0.002786
dayofweek -0.010553 -0.191832 -0.704267 -0.047692 -0.038466 -0.040235 -0.026507 -0.024804 0.246959 -0.084427 -0.002283 -0.002266 -0.002925 -0.002786 1.000000

In [29]:
import seaborn as sb

correlation_matrix = data[fit_columns].corr(method='pearson')
plt.subplots(figsize = (15, 15))

sb.heatmap(correlation_matrix, vmax=.8, square=True, annot=True)
plt.show()
fig,(ax1, ax2, ax3) = plt.subplots(ncols=3)
fig.set_size_inches(12, 5)
sb.regplot(x="hour", y="count", data=data, ax=ax1)
sb.regplot(x="temp", y="count", data=data, ax=ax2)
sb.regplot(x="humidity", y="count", data=data, ax=ax3)


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0xf2a05055c0>

In [ ]: