In [114]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ggplot import *
import brewer2mpl
import dateutil
from random import shuffle
%matplotlib inline
In [115]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.grid_search import ParameterGrid
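# note: sklearn.grid_search is the pre-0.18 module path; newer releases use sklearn.model_selection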
In [116]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [117]:
train.info()
In [118]:
train.describe()
Out[118]:
Let's try to visualise the distribution of each feature.
In [119]:
train.hist(figsize=(16,12),bins=30)
plt.show()
Let's try transforming count with a log. The evaluation metric (RMSLE, implemented later) penalises relative errors, so a log-scale target is a natural fit.
In [120]:
train['count_log'] = train['count'].map(lambda x: np.log2(x))
train.count_log.hist(bins=15)
Out[120]:
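Since the transform is log2, the inverse at prediction time has to be exp2; a quick round-trip sanity cell (count is at least 1 in this dataset, so log2 is well defined):
In [ ]:
# count_log was built with np.log2, so np.exp2 must recover the original counts
assert np.allclose(np.exp2(train['count_log']), train['count'])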
In [121]:
f, ax = plt.subplots(figsize=(20,20))
sns.corrplot(train, sig_stars=False, ax=ax)
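# note: corrplot is the old seaborn API; newer releases use sns.heatmap(train.corr(), ax=ax)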
Out[121]:
In [122]:
def select_features(data):
    return [feat for feat in data.columns if feat not in ['count_log', 'count', 'casual', 'registered', 'datetime']]

def get_X_y(data, cols=None):
    if not cols:
        cols = select_features(data)
    X = data[cols].values
    y = data['count'].values
    return X, y

def get_importance_features(data, model=RandomForestRegressor(), limit=20):
    X, y = get_X_y(data)
    cols = select_features(data)
    model.fit(X, y)
    feats = pd.DataFrame(model.feature_importances_, index=data[cols].columns)
    feats = feats.sort_values(by=0, ascending=False)[:limit]
    return feats.rename(columns={0: 'importance'})

def draw_importance_features(data, model=RandomForestRegressor(), limit=20):
    feats = get_importance_features(data, model, limit)
    feats.plot(kind='bar')

draw_importance_features(train)
In [123]:
def temp_cat(temp):
    if temp < 15: return 1
    if temp < 25: return 2
    return 3

def cat_hour(hour):
    if 5 <= hour < 10:
        return 1  # morning
    elif 10 <= hour < 17:
        return 2  # day
    elif 17 <= hour < 23:
        return 3  # evening
    else:
        return 4  # night

def etl(df):
    df['datetime'] = pd.to_datetime(df['datetime'])
    # time
    df['year'] = df['datetime'].map(lambda x: x.year)
    df['month'] = df['datetime'].map(lambda x: x.month)
    df['day'] = df['datetime'].map(lambda x: x.day)
    df['hour'] = df['datetime'].map(lambda x: x.hour)
    df['minute'] = df['datetime'].map(lambda x: x.minute)
    df['dayofweek'] = df['datetime'].map(lambda x: x.dayofweek)
    df['weekend'] = df['datetime'].map(lambda x: x.dayofweek in [5, 6])
    df['time_of_day'] = df['hour'].map(cat_hour)
    # temp
    df['temp_cat'] = df['temp'].map(temp_cat)
    df['temp_hour'] = df['temp'] * df['hour']
    # season interactions
    df['season_weather'] = df['season'] * df['weather']
    df['season_temp'] = df['season'] * df['temp']
    df['season_atemp'] = df['season'] * df['atemp']
    df['season_hour'] = df['season'] * df['hour']
    # squared
    df['temp2'] = df['temp'] ** 2
    df['humidity2'] = df['humidity'] ** 2
    df['weather2'] = df['weather'] ** 2

etl(train)
etl(test)
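A quick spot-check of a few of the engineered columns (an optional cell):
In [ ]:
train[['datetime', 'hour', 'time_of_day', 'temp', 'temp_cat', 'weekend']].head()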
Let's see how the feature importances changed after adding the new features.
In [124]:
draw_importance_features(train)
Let's check the distribution of rental counts by hour on working and non-working days.
In [125]:
by_hour = train.groupby(['hour', 'workingday'])['count'].agg('sum').unstack()
by_hour.plot(kind='bar', figsize=(15,5), width=0.9)
Out[125]:
Let's see how the range of rental counts changes by hour.
In [126]:
def plot_hours(data, message=''):
    hours = {}
    for hour in range(24):
        hours[hour] = data[data.hour == hour]['count'].values

    plt.figure(figsize=(20, 10))
    plt.ylabel("Count of rent")
    plt.xlabel("Hours")
    plt.title("count vs hours\n" + message)
    plt.boxplot([hours[hour] for hour in range(24)])

    axis = plt.gca()
    axis.set_ylim([1, 1100])

plot_hours(train[train.workingday == 1], 'working day')
plot_hours(train[train.workingday == 0], 'non working day')
Why is there such a big range at 9am on working days?
In [127]:
t9 = train[ (train.hour == 9) & (train.workingday == 1) ].reset_index()
ggplot(aes(x='count', y='month', color='year'), t9) + geom_point()\
+ ggtitle('Count of rent by month (and years) at 9am')
Out[127]:
Let's plot by year.
In [128]:
def plot_by_years(data, title):
    values = [data[data.year == 2011]['count'].values, data[data.year == 2012]['count'].values]
    print np.median(values[0]), np.median(values[1])

    plt.figure(figsize=(15, 5))
    plt.boxplot(values)
    plt.xlabel("Years")
    plt.ylabel("Count of rent")
    plt.title(title)
    _ = plt.xticks([1, 2], [2011, 2012])

plot_by_years(train, 'Count of rent by year')
plot_by_years(t9, 'Count of rent by year at 9am')
In [194]:
def get_cols():
    feats = get_importance_features(train, limit=50)
    return [feat for feat in feats[feats.importance > 0].index if feat not in ['is_test']]
In [130]:
def rmsle(y_true, y_pred):
    diff = np.log(y_pred + 1) - np.log(y_true + 1)
    mean_error = np.square(diff).mean()
    return np.sqrt(mean_error)

scorer = make_scorer(rmsle, greater_is_better=False)
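As a quick sanity check of the metric (toy values, not from the data): identical predictions give 0, and the error depends on the ratio rather than the absolute gap.
In [ ]:
y = np.array([10.0, 100.0, 1000.0])
print rmsle(y, y)      # 0.0
print rmsle(y, 2 * y)  # roughly log(2), regardless of the scale of y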
In [131]:
train.day.unique()
Out[131]:
So days run from 1 to 19. One option is a chronological split: days 1-14 (two weeks) as the training set and days 15-19 as the test set; a sketch of that follows. The next cell instead samples whole days at random, so that all hours of a given day stay on the same side of the split.
Note: splitting by day has some problems (for instance, it ignores the seasonal ordering of the data), but it looks good enough for a start.
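A minimal sketch of the chronological variant (for reference only; it is not used below):
In [ ]:
# Days 1-14 of each month as train, days 15-19 as test.
df_train_chrono = train[train.day <= 14]
df_test_chrono = train[train.day >= 15]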
In [132]:
def train_test_split(data, test_size=0.3):
    days = data.day.unique()
    shuffle(days)
    test_days = days[:int(len(days) * test_size)]
    data['is_test'] = data.day.isin(test_days)
    df_train = data[data.is_test == False]
    df_test = data[data.is_test == True]
    return df_train, df_test
In [133]:
df_train, df_test = train_test_split(train)

mean_model = df_train.groupby('hour')['count'].mean().to_dict()
median_model = df_train.groupby('hour')['count'].median().to_dict()

y_true = df_test['count'].values
y_mean = df_test['hour'].map(lambda hour: int(mean_model[hour])).values
y_median = df_test['hour'].map(lambda hour: int(median_model[hour])).values

print 'mean', rmsle(y_true, y_mean)
print 'median', rmsle(y_true, y_median)
The median looks a little better. Let's try to build a more advanced model.
In [136]:
cols = get_cols()
model = RandomForestRegressor()
df_train, df_test = train_test_split(train)

def get_y_pred(model, cols):
    X_train = df_train[cols].values
    y_train = df_train['count'].values
    X_test = df_test[cols].values
    model.fit(X_train, y_train)
    return model.predict(X_test)

def predict(model, cols):
    y_true = df_test['count'].values
    y_pred = get_y_pred(model, cols)
    return rmsle(y_true, y_pred)

def models():
    yield ExtraTreesRegressor()
    yield RandomForestRegressor()
    yield AdaBoostRegressor()
    yield BaggingRegressor()
    yield DecisionTreeRegressor()

results = {}
for model in models():
    results[predict(model, cols)] = model
In [137]:
for key in sorted(results.keys()):
    print key, results[key]
In [75]:
rf_params = {
    'n_estimators': [100, 200],
    'min_samples_split': [5, 15, 20],
    'n_jobs': [-1],
    'min_samples_leaf': [1, 2],
}

bagging_params = {
    'n_estimators': [100, 150, 170],
    'n_jobs': [-1],
}

extratree_params = {
    'n_estimators': [50, 100, 150],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [2, 3, 7],
    'n_jobs': [-1],
}

models = [
    (rf_params, RandomForestRegressor()),
    (bagging_params, BaggingRegressor()),
    (extratree_params, ExtraTreesRegressor()),
]

results = {}
for params_model, model in models:
    key_model = str(model)
    results[key_model] = []
    for params in ParameterGrid(params_model):
        model.set_params(**params)
        results[key_model].append((predict(model, cols), params))

for key in results.keys():
    print key, sorted(results[key], key=lambda x: x[0])[0]
{'min_samples_split': 3, 'n_estimators': 150, 'min_samples_leaf': 1}
{'min_samples_split': 5, 'n_estimators': 200, 'min_samples_leaf': 2}
{'n_estimators': 170}
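For comparison, the same kind of search could also be written with GridSearchCV and the rmsle scorer defined earlier; a sketch, with the caveat that it uses plain K-fold splits rather than the day-based split above:
In [ ]:
from sklearn.grid_search import GridSearchCV  # sklearn.model_selection in newer releases

grid = GridSearchCV(RandomForestRegressor(), rf_params, scoring=scorer, cv=3)
grid.fit(df_train[cols].values, df_train['count'].values)
# scores are negated because the scorer was built with greater_is_better=False
print grid.best_params_, grid.best_score_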
In [211]:
def log_predict(df_train, df_test, is_training=True):
    cols = get_cols()
    X_train = df_train[cols].values
    y_train = df_train['count_log'].values
    X_test = df_test[cols].values
    if is_training:
        y_true = df_test['count'].values

    model = ExtraTreesRegressor(**{'min_samples_split': 3, 'n_estimators': 150, 'min_weight_fraction_leaf': 0.1, 'min_samples_leaf': 2})
    model.fit(X_train, y_train)
    y_log_pred = model.predict(X_test)
    y_pred = np.exp2(y_log_pred).astype(int)  # invert the log2 transform

    if is_training:
        return rmsle(y_true, y_pred)
    else:
        return y_pred

print log_predict(df_train, df_test)
In [212]:
test['count'] = log_predict(train, test, is_training=False)
test[ ['datetime', 'count'] ].to_csv('result1.csv', index=False)
In [ ]: