In [114]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ggplot import *
import brewer2mpl
import dateutil
from random import shuffle

%matplotlib inline



In [115]:

    
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import make_scorer        
from sklearn.grid_search import ParameterGrid

1. Understand Data



In [116]:

    
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')



In [117]:

    
train.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 10886 entries, 0 to 10885
Data columns (total 12 columns):
datetime      10886 non-null object
season        10886 non-null int64
holiday       10886 non-null int64
workingday    10886 non-null int64
weather       10886 non-null int64
temp          10886 non-null float64
atemp         10886 non-null float64
humidity      10886 non-null int64
windspeed     10886 non-null float64
casual        10886 non-null int64
registered    10886 non-null int64
count         10886 non-null int64
dtypes: float64(3), int64(8), object(1)
memory usage: 1.1+ MB

1.1 Conclusion

We have 10 886 observations and 12 features.
We don't have missing value.
Most value is integer, few of them float and object (should be a date).

Let's check more detail each features.



In [118]:

    
train.describe()









    Out[118]:






  
    
      
      season
      holiday
      workingday
      weather
      temp
      atemp
      humidity
      windspeed
      casual
      registered
      count
    
  
  
    
      count
      10886.000000
      10886.000000
      10886.000000
      10886.000000
      10886.00000
      10886.000000
      10886.000000
      10886.000000
      10886.000000
      10886.000000
      10886.000000
    
    
      mean
      2.506614
      0.028569
      0.680875
      1.418427
      20.23086
      23.655084
      61.886460
      12.799395
      36.021955
      155.552177
      191.574132
    
    
      std
      1.116174
      0.166599
      0.466159
      0.633839
      7.79159
      8.474601
      19.245033
      8.164537
      49.960477
      151.039033
      181.144454
    
    
      min
      1.000000
      0.000000
      0.000000
      1.000000
      0.82000
      0.760000
      0.000000
      0.000000
      0.000000
      0.000000
      1.000000
    
    
      25%
      2.000000
      0.000000
      0.000000
      1.000000
      13.94000
      16.665000
      47.000000
      7.001500
      4.000000
      36.000000
      42.000000
    
    
      50%
      3.000000
      0.000000
      1.000000
      1.000000
      20.50000
      24.240000
      62.000000
      12.998000
      17.000000
      118.000000
      145.000000
    
    
      75%
      4.000000
      0.000000
      1.000000
      2.000000
      26.24000
      31.060000
      77.000000
      16.997900
      49.000000
      222.000000
      284.000000
    
    
      max
      4.000000
      1.000000
      1.000000
      4.000000
      41.00000
      45.455000
      100.000000
      56.996900
      367.000000
      886.000000
      977.000000

1.2 Conlusion

count is target value and it changes between from 1 to 977.
holiday, workingday is a binary variables (0 or 1).
season, weather - categorical variable (1,2,3 or 4).
registered and casual there's only in training set.
rest features is numerical variables.

Let's try to visualise distribution each features.



In [119]:

    
train.hist(figsize=(16,12),bins=30)
plt.show()

1.3 conclusion

atemp have some spikes on 10,20,30 (more rounded number).
count the most value is in the first bucket and have big tail on the second right. We can try transform using log (it can be helpful).
weather looks like corelated to count of rows.
season all buckets look comparatively equal.
windspeed there're some missing buckets between 0 and 10, and 19. Maybe it's some problem with data (human error).

Let's try transform count using log.



In [120]:

    
train['count_log'] = train['count'].map(lambda x: np.log2(x))
train.count_log.hist(bins=15)









    Out[120]:





<matplotlib.axes._subplots.AxesSubplot at 0x1125bb210>

2. Prepare Data

Correlation matrix
Importance features
Feature engineering
Feature selection

2.1 Correlation matrix

Let's to see correlation matrix



In [121]:

    
f, ax = plt.subplots(figsize=(20,20))
sns.corrplot(train, sig_stars=False, ax=ax)









    Out[121]:





<matplotlib.axes._subplots.AxesSubplot at 0x115216ad0>

count vs registered, casual (0.97, 0.69) correlated (it's not big surprise).
count vs temp, atemp also correlated, but less (0.39, 0.39).
count vs humidity, weather have inverse correlation (-0.32, -0.13).

2.2 Importances features



In [122]:

    
def select_features(data):
    return [feat for feat in data.columns if feat not in ['count_log', 'count', 'casual', 'registered', 'datetime']]

def get_X_y(data, cols=None):
    if not cols:
        cols = select_features(data)
        
    X = data[cols].values
    y = data['count'].values
    
    return X,y

def get_importance__features(data, model=RandomForestRegressor(), limit=20):
    X,y = get_X_y(data)
    cols = select_features(data)
    
    model.fit(X, y)
    feats = pd.DataFrame(model.feature_importances_, index=data[cols].columns)
    feats = feats.sort([0], ascending=False) [:limit]
    return feats.rename(columns={0:'name'})
    
def draw_importance_features(data, model=RandomForestRegressor(), limit=20):
    feats = get_importance__features(data, model, limit)
    feats.plot(kind='bar')


draw_importance_features(train)

2.3 Feature enginnering

Let's try figure out new feature based on exists. Feature datetime can be split into few part: year, month, day, hour, minute, dayofweek, weekend.



In [123]:

    
def temp_cat(temp):
    if temp < 15: return 1
    if temp < 25: return 2
    return 3

def cat_hour(hour):
    if 5 >= hour < 10:
        return 1#morning
    elif 10 >= hour < 17:
        return 2#day
    elif 17 >= hour < 23:
        return 3 #evening
    else:
        return 4 #night

def etl(df):
    df['datetime'] = pd.to_datetime( df['datetime'] )
    
    #time
    df['year'] = df['datetime'].map(lambda x: x.year)
    df['month'] = df['datetime'].map(lambda x: x.month)
    df['day'] = df['datetime'].map(lambda x: x.day)
    df['hour'] = df['datetime'].map(lambda x: x.hour)
    df['minute'] = df['datetime'].map(lambda x: x.minute)
    df['dayofweek'] = df['datetime'].map(lambda x: x.dayofweek)
    df['weekend'] = df['datetime'].map(lambda x: x.dayofweek in [5,6])
    df['time_of_day'] = df['hour'].map(cat_hour)
    
    #temp
    df['temp_cat'] = df['temp'].map(temp_cat)
    df['temp_hour'] = df.apply(lambda x: x['temp'] * x['hour'], axis=1)
    
    #season 
    df['season_weather'] = df.apply(lambda x: x['season'] * x['weather'], axis=1)
    df['season_temp'] = df.apply(lambda x: x['season'] * x['temp'], axis=1)
    df['season_atemp'] = df.apply(lambda x: x['season'] * x['atemp'], axis=1)
    df['season_hour'] = df.apply(lambda x: x['season'] * x['hour'], axis=1)
    
    #squared
    df['temp2'] = df['temp'].map(lambda x: x**2)
    df['humidity2'] = df['humidity'].map(lambda x: x**2)
    df['weather2'] = df['weather'].map(lambda x: x**2)
    

etl(train)
etl(test)

Let's see how changed importances features after adding new one.



In [124]:

    
print draw_importance_features(train)









    



None

hour - looks like very importance feature.
new generated features: seasson_attemp, temp2, dayofwork, humidity2 - there're on top 10.

Let's check distrubution count of rent by hour in working day and non working day.



In [125]:

    
by_hour = train.groupby(['hour', 'workingday'])['count'].agg('sum').unstack()
by_hour.plot(kind='bar', figsize=(15,5), width=0.9)









    Out[125]:





<matplotlib.axes._subplots.AxesSubplot at 0x113823750>

On working day we have few spikes at 7,8,9 (going to work) and 17,18,19 (return to home).
On non working day looks more smoothing (without any spikes).
In the middle day (from 10 to 15) count of rent looks simmilar.
On working day count of rent greater than non-working day.
A little greater count of rent at the night in non-working day.

Let's see how change range count of rent by hour.



In [126]:

    
def plot_hours(data, message = ''):
    hours = {}
    for hour in range(24):
        hours[hour] = data[ data.hour == hour ]['count'].values

    plt.figure(figsize=(20,10))
    plt.ylabel("Count rent")
    plt.xlabel("Hours")
    plt.title("count vs hours\n" + message)
    plt.boxplot( [hours[hour] for hour in range(24)] )
    
    axis = plt.gca()
    axis.set_ylim([1, 1100])
    

plot_hours( train[train.workingday == 1], 'working day')
plot_hours( train[train.workingday == 0], 'non working day')

The biggest variance for working day is at 8,9 and 18,19,20.
The biggest variance for non-working day is from 12 to 20.

Why there's to big range at 9am on working day?



In [127]:

    
t9 = train[ (train.hour == 9) & (train.workingday == 1) ].reset_index()
ggplot(aes(x='count', y='month', color='year'), t9) + geom_point()\
    + ggtitle('Count of rent by month (and years) at 9am')









    












    Out[127]:





<ggplot: (287746573)>

There's some wave pattern (from Jan to Aug increased, then decreased).
More bikes rent in 2012 (blue points) rather than 2011 (red points).
Ther're some outliers (e.g. Mar, Apr, May 2011 - red points).

Let's plot by years.



In [128]:

    
def plot_by_years(data, title):
    values = [ data[data.year == 2011]['count'].values, data[data.year == 2012]['count'].values]
    print np.median(values[0]), np.median(values[1])
    plt.figure(figsize=(15,5))
    plt.boxplot(values)
    plt.xlabel("Years")
    plt.ylabel("Count of rent")
    plt.title(title)
    _ = plt.xticks([1, 2], [2011, 2012])
    

plot_by_years(train, 'Count of rent by year')
plot_by_years(t9, 'Count of rent by year at 9am')









    



111.0 199.0
188.0 319.0

In general count of rent in 2012 increased, but median looks simillar (111 and 199).
At 9am median for 2011 is equal to 188, but for 2012 is 319. It looks like count of rent increased twice in 2012.

2.4 Feature selection



In [194]:

    
def get_cols():
    feats = get_importance__features(train, limit=50)
    return [feat for feat in feats[ feats.name > 0 ].index if feat  not in ['is_test']]

3. Modeling and Evaluation

Preapre quality function
Prepare training set and validate set
Single variable model (base line)
Using more advanced models
Tuning hyperparameters
Detect problem in actual models

3.1 Quality function

$$ \sqrt{\frac{1}{n} \sum_{i=1}^n (\log(p_i + 1) - \log(a_i+1))^2 }$$

where
n is the number of hours in the test set
pi is your predicted count
ai is the actual count
log(x) is the natural logarithm



In [130]:

    
def rmsle(y_true, y_pred):
    diff = np.log(y_pred + 1) - np.log(y_true + 1)
    mean_error = np.square(diff).mean()
    return np.sqrt(mean_error)

scorer = make_scorer(rmsle, greater_is_better=False)

3.2 Train and test sets

Let's check distribution by days in train set.



In [131]:

    
train.day.unique()









    Out[131]:





array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])

So we have from 1 to 19. We can try split this from 1-14 (2 weeks) like train set and 15-19 like a test set.

Note: this way have some problem, but looks good enough for starting



In [132]:

    
def train_test_split(data, last_training_day=0.3):
    days = train.day.unique()
    shuffle(days)
    test_days = days[: len(days) * 0.3]
    
    data['is_test'] = data.day.isin(test_days)
    df_train = data[data.is_test == False] 
    df_test  = data[data.is_test == True]
    
    return df_train, df_test

3.3 Single variable model



In [133]:

    
df_train, df_test = train_test_split(train)

mean_model = df_train.groupby('hour').mean()['count'].reset_index().to_dict()['count']
median_model = df_train.groupby('hour').median()['count'].reset_index().to_dict()['count']

y_true   = df_test['count'].values
y_mean   = df_test['hour'].map(lambda hour:  int(mean_model[hour]) ).values
y_median = df_test['hour'].map(lambda hour:  int(median_model[hour]) ).values

print 'mean', rmsle(y_true, y_mean)
print 'median', rmsle(y_true, y_median)









    



mean 0.79705817138
median 0.767721019165

Median looks a little better. Let's try to build more advanced model.

3.4 More advanced models



In [136]:

    
cols = get_cols()
model = RandomForestRegressor()
df_train, df_test = train_test_split(train)

def get_y_pred(model, cols):
    X_train = df_train[ cols ].values
    y_train = df_train[ 'count' ].values
    X_test  = df_test[ cols ].values

    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict(model, cols):
    y_true  = df_test['count'].values
    y_pred = get_y_pred(model, cols)
    return rmsle(y_true, y_pred)


def models():
    yield ExtraTreesRegressor()
    yield RandomForestRegressor()
    yield AdaBoostRegressor()
    yield BaggingRegressor()
    yield DecisionTreeRegressor()

results = {}
for model in models():
    results[ predict(model, cols) ] = model



In [137]:

    
for key in sorted(results.keys()):
    print key, results[key]









    



0.372899552074 BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
         verbose=0)
0.375047901919 ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
          verbose=0, warm_start=False)
0.385379868995 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
0.450158179177 DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, random_state=None,
           splitter='best')
1.07310181884 AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
         n_estimators=50, random_state=None)

The best results

RandomForestRegressor
BaggingRegressor
ExtraTreesRegressor

Improving twice result comparing to single model variable (~0.37 compare to ~0.77).

3.5 Tuning model

Let's try tuning two models:

RandomForestRegressor
BaggingRegressor
ExtraTreesRegressor



In [75]:

    
rf_params = {
    'n_estimators': [100, 200], 
    'min_samples_split': [5, 15, 20], 
    'n_jobs': [-1],
    'min_samples_leaf': [1, 2]
}

bagging_params = {
    'n_estimators': [100, 150, 170],
    'n_jobs': [-1],
}

extratree_params = {
    'n_estimators': [50, 100, 150],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [2, 3, 7],
    'n_jobs': [-1],
}

models = [
    (rf_params, RandomForestRegressor()), 
    (bagging_params, BaggingRegressor()),
    (extratree_params, ExtraTreesRegressor())
]

results = {}
for params_model, model in models:
    key_model = str(model)
    results[key_model] = []
    
    for params in ParameterGrid(params_model):
        model.set_params(**params)
        results[key_model].append( (predict(model, cols), params) )
        
for key in results.keys():
    print key, sorted(results[key], key=lambda x: x[0])[0]









    



ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
          verbose=0, warm_start=False) (0.3458557191839145, {'min_samples_split': 3, 'n_estimators': 150, 'n_jobs': -1, 'min_samples_leaf': 1})
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False) (0.3520417798618547, {'min_samples_split': 5, 'n_estimators': 200, 'n_jobs': -1, 'min_samples_leaf': 2})
BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
         verbose=0) (0.35212817547917902, {'n_estimators': 170, 'n_jobs': -1})

ExtraTreesRegressor with params {'min_samples_split': 3, 'n_estimators': 150, 'min_samples_leaf': 1}
RandomForestRegressor with params {'min_samples_split': 5, 'n_estimators': 200, 'min_samples_leaf': 2}
BaggingRegressor with params {'n_estimators': 170}

Use count_log for prediction



In [211]:

    
def log_predict(df_train, df_test, is_traininig=True):
    cols = get_cols()

    X_train = df_train[cols].values
    y_train = df_train['count_log'].values
    X_test  = df_test[cols].values
    if is_traininig:
        y_true  = df_test['count'].values

    model = ExtraTreesRegressor(**{'min_samples_split': 3, 'n_estimators': 150, 'min_weight_fraction_leaf': 0.1, 'min_samples_leaf': 2})
    model.fit(X_train, y_train)
    y_log_pred = model.predict(X_test)
    y_pred = np.exp2( y_log_pred ).astype(int)
    
    if is_traininig:
        return rmsle(y_true, y_pred)
    else:
        return y_pred
    
print log_predict(df_train, df_test)









    



0.339510634232

Predict value on test set



In [212]:

    
test['count'] = log_predict(train, test, is_traininig=False)
test[ ['datetime', 'count'] ].to_csv('result1.csv', index=False)

Kaggle score for this solution is 0.41740.



In [ ]:

	season	holiday	workingday	weather	temp	atemp	humidity	windspeed	casual	registered	count
count	10886.000000	10886.000000	10886.000000	10886.000000	10886.00000	10886.000000	10886.000000	10886.000000	10886.000000	10886.000000	10886.000000
mean	2.506614	0.028569	0.680875	1.418427	20.23086	23.655084	61.886460	12.799395	36.021955	155.552177	191.574132
std	1.116174	0.166599	0.466159	0.633839	7.79159	8.474601	19.245033	8.164537	49.960477	151.039033	181.144454
min	1.000000	0.000000	0.000000	1.000000	0.82000	0.760000	0.000000	0.000000	0.000000	0.000000	1.000000
25%	2.000000	0.000000	0.000000	1.000000	13.94000	16.665000	47.000000	7.001500	4.000000	36.000000	42.000000
50%	3.000000	0.000000	1.000000	1.000000	20.50000	24.240000	62.000000	12.998000	17.000000	118.000000	145.000000
75%	4.000000	0.000000	1.000000	2.000000	26.24000	31.060000	77.000000	16.997900	49.000000	222.000000	284.000000
max	4.000000	1.000000	1.000000	4.000000	41.00000	45.455000	100.000000	56.996900	367.000000	886.000000	977.000000