In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np
import seaborn as sns
from sklearn.dummy import DummyRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
%matplotlib inline
In [2]:
train = pd.read_csv('bike.csv')
In [3]:
train.info()
There are 10,886 observations and 12 features.
The goal is to predict count. Note: count = registered + casual
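A quick sanity check of that identity (assuming the standard Kaggle Bike Sharing Demand columns are present in train):
In [ ]:
# Verify that count is always the sum of registered and casual rentals.
assert (train['count'] == train['registered'] + train['casual']).all()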
In [4]:
train['datetime'] = pd.to_datetime( train['datetime'] )
train['day'] = train['datetime'].map(lambda x: x.day)
In [5]:
def assign_test_samples(data, test_fraction=0.3, seed=1):
    # Hold out a random subset of calendar days as the test set.
    days = data.day.unique()
    np.random.seed(seed)
    np.random.shuffle(days)
    test_days = days[: int(len(days) * test_fraction)]
    data['is_test'] = data.day.isin(test_days)

def select_features(data):
    columns = data.columns[(data.dtypes == np.int64) | (data.dtypes == np.float64) | (data.dtypes == bool)].values
    return [feat for feat in columns if feat not in ['count', 'casual', 'registered'] and 'log' not in feat]

def get_X_y(data, target_variable):
    features = select_features(data)
    X = data[features].values
    y = data[target_variable].values
    return X, y

def train_test_split(train, target_variable):
    df_train = train[train.is_test == False]
    df_test = train[train.is_test == True]
    X_train, y_train = get_X_y(df_train, target_variable)
    X_test, y_test = get_X_y(df_test, target_variable)
    return X_train, X_test, y_train, y_test

def fit_and_predict(train, model, target_variable):
    X_train, X_test, y_train, y_test = train_test_split(train, target_variable)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return (y_test, y_pred)

def post_pred(y_pred):
    # Counts cannot be negative, so clip negative predictions to zero.
    y_pred[y_pred < 0] = 0
    return y_pred

def rmsle(y_true, y_pred, y_pred_only_positive=True):
    if y_pred_only_positive: y_pred = post_pred(y_pred)
    diff = np.log(y_pred + 1) - np.log(y_true + 1)
    mean_error = np.square(diff).mean()
    return np.sqrt(mean_error)
##########
def count_prediction(train, model, target_variable='count'):
    (y_test, y_pred) = fit_and_predict(train, model, target_variable)
    if target_variable == 'count_log':
        # Undo the log2 transform and evaluate against the raw count.
        y_test = train[train.is_test == True]['count']
        y_pred = np.exp2(y_pred)
    return rmsle(y_test, y_pred)

def registered_casual_prediction(train, model):
    # Model registered and casual users separately, then sum the predictions.
    (_, registered_pred) = fit_and_predict(train, model, 'registered')
    (_, casual_pred) = fit_and_predict(train, model, 'casual')
    y_test = train[train.is_test == True]['count']
    y_pred = registered_pred + casual_pred
    return rmsle(y_test, y_pred)

def log_registered_casual_prediction(train, model):
    # Same as above, but trained on the log2(x + 1) targets.
    (_, registered_pred) = fit_and_predict(train, model, 'registered_log')
    (_, casual_pred) = fit_and_predict(train, model, 'casual_log')
    y_test = train[train.is_test == True]['count']
    y_pred = (np.exp2(registered_pred) - 1) + (np.exp2(casual_pred) - 1)
    return rmsle(y_test, y_pred)
##########
def importance_features(model, data):
    impdf = []
    fscore = model.get_booster().get_fscore()
    # Booster features are named f0, f1, ... in the order they were passed to fit(),
    # i.e. the order returned by select_features().
    maps_name = dict([("f{0}".format(i), col) for i, col in enumerate(select_features(data))])
    for ft, score in fscore.items():
        impdf.append({'feature': maps_name[ft], 'importance': score})
    impdf = pd.DataFrame(impdf)
    impdf = impdf.sort_values(by='importance', ascending=False).reset_index(drop=True)
    impdf['importance'] /= impdf['importance'].sum()
    impdf.index = impdf['feature']
    return impdf

def draw_importance_features(model, train):
    impdf = importance_features(model, train)
    return impdf.plot(kind='bar', title='Feature Importance', figsize=(20, 8))

assign_test_samples(train)
In [6]:
print('dummy', count_prediction(train, DummyRegressor()))
print('xgboost', count_prediction(train, xgb.XGBRegressor()))
In [7]:
def etl_datetime(df):
    df['year'] = df['datetime'].map(lambda x: x.year)
    df['month'] = df['datetime'].map(lambda x: x.month)
    df['hour'] = df['datetime'].map(lambda x: x.hour)
    df['minute'] = df['datetime'].map(lambda x: x.minute)
    df['dayofweek'] = df['datetime'].map(lambda x: x.dayofweek)
    df['weekend'] = df['datetime'].map(lambda x: x.dayofweek in [5, 6])

etl_datetime(train)
In [8]:
model = xgb.XGBRegressor()
print('xgboost', count_prediction(train, model))
draw_importance_features(model, train)
Out[8]:
In [9]:
model = xgb.XGBRegressor()
print('xgboost', registered_casual_prediction(train, model))
draw_importance_features(model, train)
Out[9]:
In [10]:
train['count_log'] = train['count'].map(lambda x: np.log2(x))
for name in ['registered', 'casual']:
    train['{0}_log'.format(name)] = train[name].map(lambda x: np.log2(x + 1))
In [11]:
model = xgb.XGBRegressor()
print('xgboost', count_prediction(train, model, 'count_log'))
draw_importance_features(model, train)
Out[11]:
In [12]:
model = xgb.XGBRegressor()
print('xgboost', log_registered_casual_prediction(train, model))
draw_importance_features(model, train)
Out[12]:
In [13]:
import itertools
def get_num_features(data):
    columns = data.loc[:, (data.dtypes == np.float64) | (data.dtypes == np.int64)].columns.values
    return [c for c in columns if 'count' not in c and 'registered' not in c and 'casual' not in c]

def generate_new_features(data):
    # Brute-force pairwise interactions: product, sum and difference of every
    # pair of numerical features (including a feature with itself).
    num_cols = get_num_features(data)
    for feat_x, feat_y in itertools.product(num_cols, num_cols):
        name_times = '{0}_x_{1}'.format(feat_x, feat_y)
        name_plus = '{0}_+_{1}'.format(feat_x, feat_y)
        name_diff = '{0}_-_{1}'.format(feat_x, feat_y)
        data[name_times] = data[feat_x] * data[feat_y]
        data[name_plus] = data[feat_x] + data[feat_y]
        data[name_diff] = data[feat_x] - data[feat_y]
train_magic = train.copy()
generate_new_features(train_magic)
model = xgb.XGBRegressor()
print('xgboost', log_registered_casual_prediction(train_magic, model))
draw_importance_features(model, train_magic)
Out[13]:
In [13]:
models = [
    ('decision_tree', DecisionTreeRegressor()),
    ## put the other algorithms imported above here (a filled-in sketch follows this cell)
]
for name, model in models:
    print(name, log_registered_casual_prediction(train, model))
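One possible way to fill in that list, using the regressors already imported at the top of the notebook (default hyperparameters, so the scores are only a rough comparison):
In [ ]:
models = [
    ('decision_tree', DecisionTreeRegressor()),
    ('random_forest', RandomForestRegressor()),
    ('extra_trees', ExtraTreesRegressor()),
    ('gradient_boosting', GradientBoostingRegressor()),
    ('ada_boost', AdaBoostRegressor()),
    ('bagging', BaggingRegressor()),
    ('xgboost', xgb.XGBRegressor()),
]
for name, model in models:
    print(name, log_registered_casual_prediction(train, model))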
In [71]:
for max_depth in [2, 5, 10]:
    for n_estimators in [100, 200, 300]:
        params = {'max_depth': max_depth, 'n_estimators': n_estimators}
        model = xgb.XGBRegressor(**params)
        print(params, log_registered_casual_prediction(train, model))
Let's play around with subsample, learning_rate and other hyperparameters...
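For instance, a small illustrative sweep over two of these knobs could look like the cell below (the value grids are assumptions, not tuned settings):
In [ ]:
# Illustrative sweep; the grids for learning_rate and subsample are arbitrary.
for learning_rate in [0.05, 0.1, 0.3]:
    for subsample in [0.8, 0.95, 1.0]:
        params = {'max_depth': 5, 'n_estimators': 300,
                  'learning_rate': learning_rate, 'subsample': subsample}
        model = xgb.XGBRegressor(**params)
        print(params, log_registered_casual_prediction(train, model))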
In [35]:
params = {
    'max_depth': 6,
    'n_estimators': 600,
    'subsample': 0.95,
    'colsample_bytree': 0.3,
    'learning_rate': 0.05,
    'reg_alpha': 0.1
}
model = xgb.XGBRegressor(**params)
print(params, log_registered_casual_prediction(train_magic, model))
What about feature selection (removing all the useless features)? Tip: look at sklearn.feature_selection.*; one possible starting point is sketched below.
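A minimal sketch, assuming SelectFromModel with an XGBoost estimator and a median-importance threshold (both of these choices are assumptions, not part of the original notebook):
In [ ]:
from sklearn.feature_selection import SelectFromModel

# Keep only features whose importance is above the median importance of an
# XGBoost model fitted on the log-transformed 'registered' target.
# The threshold and the target choice here are illustrative, not tuned.
X_train, X_test, y_train, y_test = train_test_split(train, 'registered_log')
selector = SelectFromModel(xgb.XGBRegressor(), threshold='median')
selector.fit(X_train, y_train)

selected = np.array(select_features(train))[selector.get_support()]
print('selected features:', list(selected))

# Retrain on the reduced feature set and score on the original scale.
model = xgb.XGBRegressor()
model.fit(selector.transform(X_train), y_train)
y_pred = np.exp2(model.predict(selector.transform(X_test))) - 1
y_true = train[train.is_test == True]['registered'].values
print('rmsle (registered only, selected features):', rmsle(y_true, y_pred))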
In [ ]: