notebook.community

Edit and run



In [1]:

    
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

import numpy as np

%matplotlib inline



In [2]:

    
data = pd.read_csv('bikes_train.csv') # , index_col=0 to use the datetime column as the DataFrame index
data.head()









    Out[2]:






  
    
      
      datetime
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      casual
      registered
      count
    
  
  
    
      0
      2011-01-01 00:00:00
      1
      0
      0
      1
      9.84
      14.395
      81
      0.0
      3
      13
      16
    
    
      1
      2011-01-01 01:00:00
      1
      0
      0
      1
      9.02
      13.635
      80
      0.0
      8
      32
      40
    
    
      2
      2011-01-01 02:00:00
      1
      0
      0
      1
      9.02
      13.635
      80
      0.0
      5
      27
      32
    
    
      3
      2011-01-01 03:00:00
      1
      0
      0
      1
      9.84
      14.395
      75
      0.0
      3
      10
      13
    
    
      4
      2011-01-01 04:00:00
      1
      0
      0
      1
      9.84
      14.395
      75
      0.0
      0
      1
      1



In [3]:

    
data.shape









    Out[3]:





(10886, 12)



In [4]:

    
data.isnull().values.any()









    Out[4]:





False



In [5]:

    
data.dropna()
data.shape









    Out[5]:





(10886, 12)



In [6]:

    
def transform_data(data):
    data.datetime = data.datetime.apply(pd.to_datetime)
    data['month'] = data.datetime.apply(lambda x : x.month)
    data['hour'] = data.datetime.apply(lambda x : x.hour)
    data['day'] = data.datetime.apply(lambda x : x.timetuple().tm_yday)
    data['dayofweek'] = data.datetime.apply(lambda x : x.isoweekday())
    data.head()
    
transform_data(data)



In [7]:

    
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(data)



In [8]:

    
fit_columns = ['season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'month', 'hour', 'dayofweek']



In [9]:

    
from sklearn.preprocessing import *

model = Pipeline([#('scaler', Normalizer()),
                  ('poly', PolynomialFeatures(degree=1)),
                  ('linear', LinearRegression(fit_intercept=False))])

model.fit(train_data[fit_columns], train_data[["count"]])









    Out[9]:





Pipeline(steps=[('poly', PolynomialFeatures(degree=1, include_bias=True, interaction_only=False)), ('linear', LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False))])



In [10]:

    
test_predictions = model.predict(test_data[fit_columns])
test_predictions.shape









    Out[10]:





(2722, 1)



In [11]:

    
test_data["prediction"] = np.ravel(test_predictions)
test_data.head()









    



D:\lib\anaconda\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':






    Out[11]:






  
    
      
      datetime
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      casual
      registered
      count
      month
      hour
      day
      dayofweek
      prediction
    
  
  
    
      5574
      2012-01-07 09:00:00
      1
      0
      0
      1
      9.02
      11.365
      80
      8.9981
      14
      116
      130
      1
      9
      7
      6
      7.381190
    
    
      8285
      2012-07-06 14:00:00
      3
      0
      1
      1
      36.90
      40.150
      31
      11.0014
      91
      184
      275
      7
      14
      188
      5
      400.870569
    
    
      8875
      2012-08-12 04:00:00
      3
      0
      0
      1
      26.24
      30.305
      69
      6.0032
      2
      8
      10
      8
      4
      225
      7
      170.834261
    
    
      8375
      2012-07-10 08:00:00
      3
      0
      1
      1
      29.52
      34.850
      74
      8.9981
      34
      615
      649
      7
      8
      192
      2
      207.195152
    
    
      5780
      2012-01-16 00:00:00
      1
      1
      0
      1
      5.74
      7.575
      46
      8.9981
      2
      23
      25
      1
      0
      16
      1
      -9.851459



In [12]:

    
print("The mean square error is %d" % np.mean((test_data["prediction"] - test_data["count"]) ** 2))









    



The mean square error is 20986



In [13]:

    
test_data[["day", "count", "prediction"]].groupby("day").aggregate(np.sum).plot(figsize=(20, 8))









    Out[13]:





<matplotlib.axes._subplots.AxesSubplot at 0x9a3ae01e80>



In [14]:

    
test_data[["month", "count", "prediction"]].groupby("month").aggregate(np.sum).plot(figsize=(20, 8))









    Out[14]:





<matplotlib.axes._subplots.AxesSubplot at 0x9a3af4df60>



In [15]:

    
test_data[["dayofweek", "count", "prediction"]].groupby("dayofweek").aggregate(np.sum).plot(figsize=(20, 8))









    Out[15]:





<matplotlib.axes._subplots.AxesSubplot at 0x9a3b290358>

	datetime	season	weather	temp	atemp	humidity	casual	registered	count
0	2011-01-01 00:00:00	1	1	9.84	14.395	81	3	13	16
1	2011-01-01 01:00:00	1	1	9.02	13.635	80	8	32	40
2	2011-01-01 02:00:00	1	1	9.02	13.635	80	5	27	32
3	2011-01-01 03:00:00	1	1	9.84	14.395	75	3	10	13
4	2011-01-01 04:00:00	1	1	9.84	14.395	75	0	1	1

	datetime	season	holiday	workingday	weather	temp	atemp	humidity	windspeed	casual	registered	count	month	hour	day	dayofweek	prediction
5574	2012-01-07 09:00:00	1	0	0	1	9.02	11.365	80	8.9981	14	116	130	1	9	7	6	7.381190
8285	2012-07-06 14:00:00	3	0	1	1	36.90	40.150	31	11.0014	91	184	275	7	14	188	5	400.870569
8875	2012-08-12 04:00:00	3	0	0	1	26.24	30.305	69	6.0032	2	8	10	8	4	225	7	170.834261
8375	2012-07-10 08:00:00	3	0	1	1	29.52	34.850	74	8.9981	34	615	649	7	8	192	2	207.195152
5780	2012-01-16 00:00:00	1	1	0	1	5.74	7.575	46	8.9981	2	23	25	1	0	16	1	-9.851459