In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np
%matplotlib inline
In [2]:
# Load the training set from a CSV file in the notebook's working directory.
# transform_data (defined further down) reads `data.datetime` as a regular
# column, so the default of not using it as the index is what that code expects.
data = pd.read_csv('bikes_train.csv') # optional: pass index_col=0 to use the datetime column as the DataFrame index
# Preview the first rows to sanity-check the parse.
data.head()
Out[2]:
In [3]:
# (rows, columns) of the frame as loaded.
data.shape
Out[3]:
In [4]:
# True when at least one cell anywhere in the frame is missing.
data.isna().to_numpy().any()
Out[4]:
In [5]:
# DataFrame.dropna() returns a new frame rather than modifying the caller's,
# so the result has to be bound back to `data`; otherwise rows with missing
# values stay in the frame that is used for fitting. Re-running this cell is
# harmless because a frame without missing values is returned unchanged.
data = data.dropna()
# Shape after cleaning, to compare with the shape shown right after loading.
data.shape
Out[5]:
In [6]:
def transform_data(data):
    """Add calendar feature columns derived from ``data['datetime']``, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``datetime`` column holding timestamps, or strings that
        ``pd.to_datetime`` can parse. The frame is modified in place.

    Returns
    -------
    None
        After the call ``data['datetime']`` is a datetime column and these
        columns have been added (or overwritten):

        * ``month``     - calendar month, 1-12
        * ``hour``      - hour of day, 0-23
        * ``day``       - day of the year, 1-366
        * ``dayofweek`` - ISO weekday, Monday=1 ... Sunday=7
    """
    # Parse the whole column once. Calling this again on an already-parsed
    # column is a no-op, so the enclosing cell can be re-run safely.
    data['datetime'] = pd.to_datetime(data['datetime'])

    # The .dt accessor extracts each component for the entire column at once,
    # instead of invoking a Python lambda for every row.
    stamps = data['datetime'].dt
    data['month'] = stamps.month
    data['hour'] = stamps.hour
    data['day'] = stamps.dayofyear
    # .dt.dayofweek counts from Monday=0; add one to keep the ISO numbering
    # (Monday=1) that datetime.isoweekday() produces.
    data['dayofweek'] = stamps.dayofweek + 1
# Add the month / hour / day / dayofweek columns to `data`; transform_data
# assigns them directly on the frame it is given, so nothing is re-bound here.
transform_data(data)
In [7]:
from sklearn.model_selection import train_test_split

# train_test_split shuffles the rows before splitting. Without a fixed seed
# every Restart & Run All produces a different split, and therefore different
# fitted coefficients, error figures and plots further down.
SPLIT_SEED = 42
train_data, test_data = train_test_split(data, random_state=SPLIT_SEED)
In [8]:
# Feature columns fed to the model. `day` (day of year) is computed by
# transform_data but is not part of this list.
fit_columns = [
    # not created anywhere in this notebook - presumably present in the CSV
    'season', 'holiday', 'workingday', 'weather',
    'temp', 'atemp', 'humidity', 'windspeed',
    # added by transform_data
    'month', 'hour', 'dayofweek',
]
In [9]:
# NOTE(review): wildcard import kept as-is; other cells may rely on names it
# brings in (e.g. Normalizer for the disabled scaler stage below).
from sklearn.preprocessing import *

# Two-stage pipeline: feature expansion followed by ordinary least squares.
# With degree=1 the expansion keeps the raw features and (by default) adds a
# constant column, which stands in for the intercept that the linear stage is
# told not to fit itself.
# Optional first stage, currently disabled: ('scaler', Normalizer())
model = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=1)),
    ('linear', LinearRegression(fit_intercept=False)),
])

# The target is passed as a one-column frame (double brackets), not a Series.
model.fit(X=train_data[fit_columns], y=train_data[["count"]])
Out[9]:
In [10]:
# Predict on the held-out rows using the same feature columns the model was fit on.
test_predictions = model.predict(test_data[fit_columns])
# Show the array's shape. Presumably (n_rows, 1), because the model was fit on a
# one-column target frame - TODO confirm against the cell output; the
# predictions are flattened before being stored as a column.
test_predictions.shape
Out[10]:
In [11]:
# Store the predictions as a flat 1-D column alongside the observed "count".
test_data["prediction"] = test_predictions.ravel()
# Preview a few rows with the new column.
test_data.head()
Out[11]:
In [12]:
# Mean of the squared differences between predicted and observed counts on the
# held-out rows; %d prints only the integer part.
mse = test_data["prediction"].sub(test_data["count"]).pow(2).mean()
print("The mean square error is %d" % mse)
In [13]:
# Observed vs. predicted totals per day of year on the held-out rows.
# The built-in groupby sum replaces aggregate(np.sum): newer pandas emits a
# FutureWarning when a NumPy callable is passed to aggregate, and the built-in
# reduction returns the same table.
(
    test_data
    .groupby("day")[["count", "prediction"]]
    .sum()
    .plot(figsize=(20, 8), title="Total count vs. prediction by day of year (test split)")
)
Out[13]:
In [14]:
# Observed vs. predicted totals per calendar month on the held-out rows.
# The built-in groupby sum replaces aggregate(np.sum): newer pandas emits a
# FutureWarning when a NumPy callable is passed to aggregate, and the built-in
# reduction returns the same table.
(
    test_data
    .groupby("month")[["count", "prediction"]]
    .sum()
    .plot(figsize=(20, 8), title="Total count vs. prediction by month (test split)")
)
Out[14]:
In [15]:
# Observed vs. predicted totals per ISO weekday (1 = Monday) on the held-out rows.
# The built-in groupby sum replaces aggregate(np.sum): newer pandas emits a
# FutureWarning when a NumPy callable is passed to aggregate, and the built-in
# reduction returns the same table.
(
    test_data
    .groupby("dayofweek")[["count", "prediction"]]
    .sum()
    .plot(figsize=(20, 8), title="Total count vs. prediction by ISO weekday, 1 = Monday (test split)")
)
Out[15]: