In [1]:
# !pip install -U kaggle
# register the token in your Kaggle profile & save it to ~/.kaggle/kaggle.json
# !kaggle competitions download -c bike-sharing-demand
In [19]:
import pandas as pd
from sklearn import linear_model
from scipy import stats
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer
import numpy as np
from dateutil.parser import parse
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
%matplotlib inline
In [3]:
# to prevent chained-assignment warnings inside sklearn code
pd.options.mode.chained_assignment = None
In [4]:
df = pd.read_csv("train.csv")
df.shape
Out[4]:
In [5]:
df.head()
Out[5]:
First, we need the custom score function described on the competition's evaluation page: https://www.kaggle.com/c/bike-sharing-demand/overview/evaluation
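The metric is the Root Mean Squared Logarithmic Error (RMSLE). For predictions $p_i$ and actual counts $a_i$ over $n$ hours it is
$$\mathrm{RMSLE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\bigl(\log(p_i + 1) - \log(a_i + 1)\bigr)^2},$$
which is exactly what the function below computes via log1p.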
In [6]:
def rmsle(y_true, y_pred):
    # clip negative predictions to zero so that log1p stays defined
    y_pred_clipped = np.clip(y_pred, 0., None)
    return mean_squared_error(np.log1p(y_true), np.log1p(y_pred_clipped)) ** .5
Let's start with the existing features and simple linear regression.
The reason for wrapping everything into feature extractors and grid search will become clear further on.
In [7]:
class SimpleFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[["holiday", "workingday", "season", "weather", "temp", "atemp", "humidity", "windspeed"]].values
In [8]:
extractor = SimpleFeatureExtractor()
clf = Pipeline([
    ("extractor", extractor),
    ("regression", linear_model.LinearRegression()),
])
param_grid = {}
scorer = make_scorer(rmsle, greater_is_better=False)
researcher = GridSearchCV(clf, param_grid, scoring=scorer, cv=5, n_jobs=4, verbose=1, refit=False)
researcher.fit(df, df["count"].values)
Out[8]:
The hyperparameter search always maximizes the score function, so when a metric needs to be minimized, make_scorer with greater_is_better=False simply negates it.
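A quick way to see the sign convention (a sketch, fitting the same pipeline on the full training set):
neg_rmsle = make_scorer(rmsle, greater_is_better=False)
clf.fit(df, df["count"].values)
print(neg_rmsle(clf, df, df["count"].values))       # the negated RMSLE
print(-rmsle(df["count"].values, clf.predict(df)))  # the same number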
In [9]:
researcher.best_score_
Out[9]:
In [10]:
extractor = SimpleFeatureExtractor()
clf = Pipeline([
    ("extractor", extractor),
    ("regression", linear_model.ElasticNet()),
])
In [11]:
param_grid = {
    "regression__alpha": np.logspace(-3, 2, 10),
    "regression__l1_ratio": np.linspace(0, 1, 10)
}
scorer = make_scorer(rmsle, greater_is_better=False)
researcher = GridSearchCV(clf, param_grid, scoring=scorer, cv=5, n_jobs=4, verbose=1, refit=False)
researcher.fit(df, df["count"].values)
Out[11]:
In [12]:
researcher.best_score_
Out[12]:
In [13]:
researcher.best_params_
Out[13]:
In [14]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    ohe = OneHotEncoder(categories='auto', sparse=False)
    scaler = StandardScaler()
    categorical_columns = ["week_day", "hour", "season", "weather"]
    numerical_columns = ["temp", "atemp", "humidity", "windspeed"]

    def _add_features(self, X):
        # derive the day of week and the hour from the datetime string
        X["week_day"] = X.datetime.apply(lambda dttm: parse(dttm).weekday())
        X["hour"] = X.datetime.apply(lambda dttm: parse(dttm).hour)

    def _combine(self, *feature_groups):
        return np.hstack(feature_groups)

    def collect_stats(self, X):
        # fit the encoder and the scaler once, before cross-validation
        self._add_features(X)
        self.ohe.fit(X[self.categorical_columns])
        self.scaler.fit(X[self.numerical_columns])

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        self._add_features(X)
        custom_binary_features = self.ohe.transform(X[self.categorical_columns])
        scaled_features = self.scaler.transform(X[self.numerical_columns])
        return self._combine(
            custom_binary_features,
            scaled_features,
            X[["holiday", "workingday"]].values
        )
In [15]:
extractor = FeatureExtractor()
extractor.collect_stats(df)
clf = Pipeline([
    ("extractor", extractor),
    ("regression", linear_model.ElasticNet()),
])
In [16]:
param_grid = {
    "regression__alpha": np.logspace(-3, 2, 10),
    "regression__l1_ratio": np.linspace(0, 1, 10)
}
pd.options.mode.chained_assignment = None
scorer = make_scorer(rmsle, greater_is_better=False)
researcher = GridSearchCV(clf, param_grid, scoring=scorer, cv=5, n_jobs=4, verbose=1, refit=False)
researcher.fit(df, df["count"].values)
Out[16]:
In [17]:
researcher.best_score_
Out[17]:
In [18]:
researcher.best_params_
Out[18]:
In [31]:
scorer = make_scorer(mean_squared_error, greater_is_better=False)
scores = cross_val_score(clf, df, df["count"].values, cv=5, n_jobs=4, scoring=scorer)
np.mean((-np.array(scores)) ** .5)  # negate the negated MSE scores and take the root: mean RMSE over folds
Out[31]:
What we could theoretically get if we optimized RMSE directly:
In [34]:
param_grid = {
    "regression__alpha": np.logspace(-3, 2, 10),
    "regression__l1_ratio": np.linspace(0, 1, 10)
}
pd.options.mode.chained_assignment = None

def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred) ** .5

scorer = make_scorer(rmse, greater_is_better=False)
researcher = GridSearchCV(clf, param_grid, scoring=scorer, cv=5, n_jobs=4, verbose=1, refit=False)
researcher.fit(df, df["count"].values)
Out[34]:
In [35]:
researcher.best_score_
Out[35]:
In [36]:
researcher.best_params_
Out[36]:
https://en.wikipedia.org/wiki/Poisson_distribution
In probability theory and statistics, the Poisson distribution (French pronunciation: [pwasɔ̃]; in English often rendered /ˈpwɑːsɒn/), named after French mathematician Siméon Denis Poisson, is a discrete probability distribution that expresses the probability of a given number of events occurring in a fixed interval of time or space if these events occur with a known constant rate and independently of the time since the last event.[1] The Poisson distribution can also be used for the number of events in other specified intervals such as distance, area or volume.
Another point of view: imagine 200 people, each with a 3% probability of picking up a bike; the hourly number of rentals is then approximately Poisson-distributed.
What about the CLT? It applies as $n \rightarrow \infty$. For this binomial setting there is a special case called the De Moivre–Laplace theorem.
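A quick numerical check of that picture (a sketch; the 200 people / 3% figures are just the illustration above):
n, p = 200, 0.03
k = np.arange(0, 16)
binom_pmf = stats.binom.pmf(k, n, p)          # exact Binomial(200, 0.03) probabilities
poisson_pmf = stats.poisson.pmf(k, n * p)     # Poisson approximation with lambda = n * p = 6
print(np.abs(binom_pmf - poisson_pmf).max())  # the two pmfs differ only slightly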
The list of generalized linear regression methods available in sklearn: https://scikit-learn.org/stable/modules/linear_model.html
There is no Poisson regression among them.
So let's write down the probabilistic model for the Poisson distribution and maximize its likelihood ourselves.
Homework: try to do it yourself.
Hint:
start from the assumption $\hat{y} = \exp(\langle x, \theta \rangle)$, take the derivative of the log-likelihood with respect to $\theta$, set it to zero, and check the sign of the second derivative.
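For reference, under that assumption the log-likelihood to differentiate is
$$\ell(\theta) = \sum_{i=1}^{n} \left( y_i \langle x_i, \theta \rangle - \exp(\langle x_i, \theta \rangle) - \log y_i! \right).$$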
The conclusion: we can approximate Poisson regression with a simple wrapper that fits a linear model on $\log y$ and exponentiates its predictions.
Check whether we run into trouble with np.log when y == 0.
In [37]:
df[df["count"] == 0]
Out[37]:
In [38]:
np.log(0)
Out[38]:
In [39]:
class PoissonRegression(linear_model.ElasticNet):
    def __init__(self, alpha=1.0, l1_ratio=0.5, fit_intercept=True,
                 normalize=False, precompute=False, max_iter=1000,
                 copy_X=True, tol=1e-4, warm_start=False, positive=False,
                 random_state=None, selection='cyclic'):
        super().__init__(alpha, l1_ratio, fit_intercept, normalize, precompute, max_iter,
                         copy_X, tol, warm_start, positive, random_state, selection)

    def fit(self, X, y, *args):
        # fit the underlying ElasticNet on log(y)...
        return super().fit(X, np.log(y), *args)

    def predict(self, X):
        # ...and exponentiate its predictions back to counts
        return np.exp(super().predict(X))
In [40]:
extractor = FeatureExtractor()
extractor.collect_stats(df)
clf = Pipeline([
    ("extractor", extractor),
    ("regression", PoissonRegression()),
])
param_grid = {
    "regression__alpha": np.logspace(-5, 1, 20),
    "regression__l1_ratio": np.linspace(0, 1, 10)
}
pd.options.mode.chained_assignment = None
scorer = make_scorer(rmsle, greater_is_better=False)
researcher = GridSearchCV(clf, param_grid, scoring=scorer, cv=5, n_jobs=4, verbose=1, refit=False)
researcher.fit(df, df["count"].values)
Out[40]:
In [41]:
researcher.best_params_
Out[41]:
In [42]:
researcher.best_score_
Out[42]:
In terms of MSE the score is worse. But that doesn't mean MSE is the most relevant metric here. At least Poisson regression never predicts negative values.
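A quick sanity check of that last point (a sketch; it refits the current pipeline on the whole training set):
clf.fit(df, df["count"].values)
print((clf.predict(df) < 0).sum())  # 0: the exp() in predict keeps every prediction positive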
In [43]:
scorer = make_scorer(mean_squared_error, greater_is_better=False)
scores = cross_val_score(clf, df, df["count"].values, cv=5, n_jobs=4, scoring=scorer)
np.mean((-np.array(scores)) ** .5)  # mean RMSE over the folds
Out[43]:
When you need to predict counts, try Poisson regression. Experience alone can get you good enough results, but it is not enough when you face a new type of task: the more complicated the task, the less your previous experience helps.
The key to success is a solid education. With education you can do research.
In [52]:
df_test = pd.read_csv("test.csv")
cols = df_test.columns
# concatenate train and test so the encoder and scaler see every category and value range
all_data = pd.concat([df[cols], df_test[cols]])
In [63]:
extractor = FeatureExtractor()
extractor.collect_stats(all_data)
clf = Pipeline([
    ("extractor", extractor),
    ("regression", PoissonRegression(alpha=0.001623776739188721, l1_ratio=0.1111111111111111)),
])
clf.fit(df, df["count"].values)
df_test["count"] = clf.predict(df_test)
In [68]:
df_test[["datetime","count"]].set_index("datetime").to_csv("linear.csv")
In [71]:
# !kaggle competitions submit -f linear.csv -m "linear regression" bike-sharing-demand
# score 0.64265
Further steps: use a Random Forest regressor & CatBoost to get into the top 10%.
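A possible starting point for that (an untuned sketch, not a submitted solution): reuse the same feature extractor and swap the linear model for a tree ensemble, fitting on the log scale to stay close to the RMSLE objective.
from sklearn.ensemble import RandomForestRegressor

extractor = FeatureExtractor()
extractor.collect_stats(all_data)
rf = Pipeline([
    ("extractor", extractor),
    ("regression", RandomForestRegressor(n_estimators=200, n_jobs=4, random_state=42)),
])
rf.fit(df, np.log1p(df["count"].values))          # train on log1p(count)
df_test["count"] = np.expm1(rf.predict(df_test))  # invert the transform for the submission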