In [16]:
import pandas as pd
pd.options.display.max_columns = 200

import numpy as np
import pickle

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

# increase the default figure size
from pylab import rcParams
rcParams['figure.figsize'] = 18, 6
rcParams['font.size'] = 16
rcParams['axes.labelsize'] = 14
rcParams['xtick.labelsize'] = 13
rcParams['ytick.labelsize'] = 13

Data

Go to https://www.drivendata.org/ and register. Today's homework takes its data from there.

We are interested in the competition https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/page/23/ . It features data collected by Taarifa and the Tanzanian Ministry of Water and Irrigation.

The problem is as follows: a large number of water pumps are installed across Tanzania, keeping the local population supplied with drinking water. Knowing who installed a pump and when, as well as how it is managed, we can try to predict which pumps are functional, which need repair, and which do not work at all.

That is what we will do here, and along the way we will also get better at tuning algorithm hyperparameters.


In [17]:
train_X, train_y = pd.read_csv( # path to your train.csv file
    'data/WaterTable/train.csv'
), pd.read_csv( # path to your trainLabels.csv file
    'data/WaterTable/trainLabels.csv'
)
df = pd.merge(train_X, train_y, how='left')

df_test = pd.read_csv( # path to your test.csv file
    'data/WaterTable/test.csv'
)

Preprocessing


In [18]:
def reduce_factor_levels(df, column_name, limit=None, top=None, name=None):
    """Collapse all but the most frequent levels of a column into one 'OTHER' level."""
    assert limit is not None or top is not None, 'Specify limit or top'
    if top is None:
        # keep the `limit` most frequent levels
        top = df[column_name].value_counts()[:limit].index
    if name is None:
        name = '%s_OTHER' % column_name
    # everything outside `top` becomes the placeholder level
    df.loc[~df[column_name].isin(top), column_name] = name
    return top
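
To see what the helper does, a quick toy run (a minimal sketch; the example data is made up):

In [ ]:
toy = pd.DataFrame({'funder': ['A', 'A', 'B', 'C', 'D']})
toy_top = reduce_factor_levels(toy, 'funder', limit=1)  # keep only the single most frequent level
toy.funder.tolist()  # -> ['A', 'A', 'funder_OTHER', 'funder_OTHER', 'funder_OTHER']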

In [19]:
# keep the 10 most frequent funders/installers; reuse the train top levels on test
top = reduce_factor_levels(df, 'funder', 10)
reduce_factor_levels(df_test, 'funder', top=top);

top = reduce_factor_levels(df, 'installer', 10)
reduce_factor_levels(df_test, 'installer', top=top);

# drop high-cardinality and uninformative columns
drop = ['wpt_name', 'num_private', 'subvillage', 'region_code', 'district_code', 'lga', 'ward', 'recorded_by', 'scheme_name']

df.drop(drop, axis=1, inplace=True)
df_test.drop(drop, axis=1, inplace=True)

# unify missing scheme_management values (both the string 'None' and NaN) as empty strings
df.loc[df.scheme_management == 'None', 'scheme_management'] = ''
df.loc[df.scheme_management.isnull(), 'scheme_management'] = ''

df_test.loc[df_test.scheme_management.isnull(), 'scheme_management'] = ''

# flag whether the construction year is known (0 means unknown)
df['construction_date_known'] = (df.construction_year > 0).astype(np.int32)
df_test['construction_date_known'] = (df_test.construction_year > 0).astype(np.int32)

# turn the construction year into a decade index, with 0 for unknown years
min_year = df[df.construction_year > 0].construction_year.min() // 10 - 1

df['construction_decade'] = df.construction_year // 10 - min_year
df_test['construction_decade'] = df_test.construction_year // 10 - min_year

df.loc[df.construction_decade < 0, 'construction_decade'] = 0
df_test.loc[df_test.construction_decade < 0, 'construction_decade'] = 0

# keep only the 20 most frequent construction years as levels
top = reduce_factor_levels(df, 'construction_year', 20)
reduce_factor_levels(df_test, 'construction_year', top=top);

# merge a rare extraction type into 'other'
df.loc[df.extraction_type == 'other - mkulima/shinyanga', 'extraction_type'] = 'other'

# bin GPS height into 500-unit buckets and drop the raw column
heights = np.arange(-1, df.gps_height.max()+500, 500)
height_labels = list(range(len(heights)-1))

df['gps_height_rounded'] = pd.cut(df.gps_height, bins=heights, labels=height_labels)
df_test['gps_height_rounded'] = pd.cut(df_test.gps_height, bins=heights, labels=height_labels)

df.drop(['gps_height'], axis=1, inplace=True)
df_test.drop(['gps_height'], axis=1, inplace=True)

# same for population: 500-person buckets
pops = np.arange(-1, df.population.max()+500, 500)
pops_labels = list(range(len(pops)-1))

df['pop_rounded'] = pd.cut(df.population, bins=pops, labels=pops_labels)
df_test['pop_rounded'] = pd.cut(df_test.population, bins=pops, labels=pops_labels)

df.drop(['population'], axis=1, inplace=True)
df_test.drop(['population'], axis=1, inplace=True)

# the recording date is not used as a feature
df.drop(['date_recorded'], axis=1, inplace=True)
df_test.drop(['date_recorded'], axis=1, inplace=True)

# fill the remaining gaps with default values
df.public_meeting.fillna(True, inplace=True)
df_test.public_meeting.fillna(True, inplace=True)

df.permit.fillna(True, inplace=True)
df_test.permit.fillna(True, inplace=True)

df.gps_height_rounded.fillna(0, inplace=True)
df_test.gps_height_rounded.fillna(0, inplace=True)

X, y, X_test = df.drop(['id', 'status_group'], axis=1), \
               df.status_group, \
               df_test.drop(['id'], axis=1)
        
def prepare(X_train, X_test):
    """One-hot encode the object columns and standardize the numeric ones."""
    from sklearn.preprocessing import StandardScaler
    from sklearn.feature_extraction import DictVectorizer
    
    objects = X_train.select_dtypes(include=['O']).columns.values
    numeric = X_train.select_dtypes(exclude=['O']).columns.values
    
    # one-hot encode categorical columns; fit on train, reuse on test
    dv = DictVectorizer(sparse=False)
    data_encoded_tr = dv.fit_transform(X_train[objects].to_dict(orient='records'))
    data_encoded_ts = dv.transform(X_test[objects].to_dict(orient='records'))

    # standardize numeric columns with train statistics
    ss = StandardScaler()
    data_scaled_tr = ss.fit_transform(X_train[numeric])
    data_scaled_ts = ss.transform(X_test[numeric])
    
    train = np.hstack((data_encoded_tr, data_scaled_tr))
    test  = np.hstack((data_encoded_ts, data_scaled_ts))
    return train, test

x_train, x_test = prepare(X, X_test)

# encode the three status_group labels as integers 0..2
from sklearn.preprocessing import LabelEncoder
y_encoder = LabelEncoder()
y = y_encoder.fit_transform(y)
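
A quick sanity check on the prepared matrices (a minimal sketch, not required by the assignment):

In [ ]:
# train/test must have the same feature width; bincount shows the class balance
print(x_train.shape, x_test.shape)
print(dict(zip(y_encoder.classes_, np.bincount(y))))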

Task

Using ensembling, stacking, and blending, improve yesterday's results.

Yesterday's results, for intimidation:


In [24]:
from sklearn.model_selection import cross_val_score

In [25]:
from lightgbm import LGBMClassifier

clf2 = LGBMClassifier(max_bin=460, learning_rate=0.11, n_estimators=140, num_leaves=130)
 
scores = cross_val_score(clf2, x_train, y)
np.mean(scores), 2*np.std(scores)


Out[25]:
(0.80228956228956239, 0.0032337258460721531)

In [26]:
clf2 = clf2.fit(x_train, y)

In [27]:
y_te = clf2.predict(x_test)
y_te


Out[27]:
array([2, 0, 0, ..., 0, 0, 2], dtype=int64)

In [28]:
ans_nn = pd.DataFrame({'id': df_test['id'], 'status_group': y_encoder.inverse_transform(y_te)})
ans_nn.head()


Out[28]:
id status_group
0 50785 non functional
1 51630 functional
2 17168 functional
3 45559 non functional
4 49871 functional

In [29]:
#ans_nn.to_csv('ans_lightgbm.csv', index=False)

Let's try ensembling


In [4]:
from sklearn.pipeline import make_pipeline
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [5]:
clfs = [make_pipeline(PolynomialFeatures(degree=2), LogisticRegression()), 
        LGBMClassifier(), 
        DecisionTreeClassifier()]

In [18]:
eclf = EnsembleVoteClassifier(clfs=clfs, voting='soft', weights=[1,2,1])

In [19]:
scores = cross_val_score(eclf, x_train, y)
np.mean(scores), 2*np.std(scores)


Out[19]:
(0.75796296296296306, 0.0021096902623327272)

Out: after 4 hours of running, fit was still hanging in memory without producing any output. When the cross-validation did complete, mean(scores) = 0.76407; that figure is for 'hard' voting without weights.
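
For reference, the 'hard'-voting configuration mentioned above can be reproduced like this (a sketch; n_jobs=-1 is my addition to run the CV folds in parallel):

In [ ]:
# hard majority voting without weights -- the variant that reportedly reached ~0.764
eclf_hard = EnsembleVoteClassifier(clfs=clfs, voting='hard')
scores = cross_val_score(eclf_hard, x_train, y, n_jobs=-1)
np.mean(scores), 2*np.std(scores)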


In [ ]:
#eclf = eclf.fit(x_train, y)

In [ ]:
#y_te = eclf.predict(x_test)
#y_te

In [ ]:
#ans_nn = pd.DataFrame({'id': df_test['id'], 'status_group': y_encoder.inverse_transform(y_te)})
#ans_nn.head()

In [ ]:
#ans_nn.to_csv('ans_ansambl.csv', index=False)

Let's try blending


In [20]:
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error
#from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline

np.set_printoptions(precision=6)
np.set_printoptions(suppress=True)

np.random.seed(1000)

In [25]:
# create dataset
dataset = Dataset(x_train, y, x_test)

# initialize RandomForest & LinearRegression 
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr')

In [26]:
# Stack two models 
# Returns new dataset with out-of-fold predictions
pipeline = ModelsPipeline(model_rf,model_lr)
stack_ds = pipeline.stack(k=10,seed=111)

In [27]:
# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()

In [29]:
results


Out[29]:
array([ 0.488933,  0.620063,  0.493418, ...,  0.428759,  0.223097,
        1.957745])
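
The second stage here is a regressor, so these predictions are continuous scores rather than class labels; to turn them into a submission they would have to be mapped back onto the three encoded classes, e.g. by rounding and clipping (a rough sketch of my own, not part of heamy):

In [ ]:
# round and clip the regression outputs onto the valid encoded class range 0..2
y_blend = np.clip(np.rint(results), 0, 2).astype(int)
y_encoder.inverse_transform(y_blend)[:5]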

In [35]:
# Use randomly sampled 20% of the data as a holdout dataset
res = model_lr.validate(mean_absolute_error, test_size=0.20)


Metric: mean_absolute_error
Accuracy: 0.639200579197
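
Despite heamy's generic "Accuracy" label, the number above is the mean_absolute_error value. Also note that pipeline.stack above is really stacking on out-of-fold predictions; blending proper trains the second-stage model on a holdout split instead. heamy exposes this as ModelsPipeline.blend; a minimal sketch, assuming that API:

In [ ]:
# blending: second-stage model is trained on a 20% holdout rather than out-of-fold predictions
blend_ds = pipeline.blend(proportion=0.2, seed=111)
blender = Regressor(dataset=blend_ds, estimator=LinearRegression)
blend_results = blender.predict()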

Let's try stacking


In [12]:
from mlxtend.classifier import StackingCVClassifier, StackingClassifier
from sklearn.ensemble import RandomForestClassifier

In [13]:
sclf = StackingCVClassifier(classifiers=clfs, meta_classifier=RandomForestClassifier())

In [ ]:
scores = cross_val_score(sclf, x_train, y)
np.mean(scores), 2*np.std(scores)


Out[ ]:
(0.75860269360269361, 0.002571738242953299)

out: Windows blue screen with "unexpected store exception" after 2 hours of running, plus a "python.exe not responding" error.
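
If the full stack is too heavy for the machine, a cheaper sanity check is to cross-validate it on a stratified subsample first (a sketch; the 20% size is an arbitrary assumption):

In [ ]:
from sklearn.model_selection import train_test_split

# score the stack on 20% of the data to keep the runtime manageable
x_small, _, y_small, _ = train_test_split(x_train, y, train_size=0.2, stratify=y, random_state=42)
scores = cross_val_score(sclf, x_small, y_small)
np.mean(scores), 2*np.std(scores)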


In [ ]:
#sclf = sclf.fit(x_train, y)

In [ ]:
#y_te = sclf.predict(x_test)
#y_te

In [ ]:
#ans_nn = pd.DataFrame({'id': df_test['id'], 'status_group': y_encoder.inverse_transform(y_te)})
#ans_nn.head()

In [ ]:
#ans_nn.to_csv('ans_stack.csv', index=False)