In [16]:
import pandas as pd
pd.options.display.max_columns = 200
import numpy as np
import pickle
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
# increase the default figure size
from pylab import rcParams
rcParams['figure.figsize'] = 18, 6
rcParams['font.size'] = 16
rcParams['axes.labelsize'] = 14
rcParams['xtick.labelsize'] = 13
rcParams['ytick.labelsize'] = 13
Go to https://www.drivendata.org/ and register. Today's homework uses data from this site.
We are interested in the competition https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/page/23/ , which features data collected by Taarifa and the Tanzanian Ministry of Water and Irrigation.
The problem setup is as follows: many water pumps are installed across Tanzania, saving the local population from thirst. Knowing who installed a pump and when, as well as how it is managed, we can try to predict which pumps are functional, which need repair, and which do not work at all.
That is exactly what we will do, while also practicing hyperparameter tuning along the way.
In [17]:
# paths to your train.csv / trainLabels.csv
train_X = pd.read_csv('data/WaterTable/train.csv')
train_y = pd.read_csv('data/WaterTable/trainLabels.csv')
# attach the labels to the training features
df = pd.merge(train_X, train_y, how='left')
# path to your test.csv
df_test = pd.read_csv('data/WaterTable/test.csv')
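A quick sanity check that the merge kept every training row is cheap and worth running locally; nothing below is specific to this dataset:

# every training row should have received exactly one label
print(train_X.shape, train_y.shape, df.shape, df_test.shape)
assert len(df) == len(train_X)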
In [18]:
def reduce_factor_levels(df, column_name, limit=None, top=None, name=None):
    """Keep only the most frequent levels of a column, relabelling the
    rest as '<column>_OTHER'. Pass `limit` to learn the top levels by
    frequency, or `top` to reuse levels learned on another frame."""
    assert limit is not None or top is not None, 'Specify limit or top'
    if top is None:
        top = df[column_name].value_counts()[:limit].index
    if name is None:
        name = '%s_OTHER' % column_name
    df.loc[~df[column_name].isin(top), column_name] = name
    return top
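A toy illustration of what this helper does (made-up column, not the competition data):

toy = pd.DataFrame({'funder': ['A', 'A', 'B', 'C']})
kept = reduce_factor_levels(toy, 'funder', limit=1)
print(toy.funder.tolist())  # ['A', 'A', 'funder_OTHER', 'funder_OTHER']
print(list(kept))           # ['A']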
In [19]:
# keep the 10 most frequent funders/installers; pool the rest into an OTHER level,
# reusing the levels learned on train when transforming test
top = reduce_factor_levels(df, 'funder', 10)
reduce_factor_levels(df_test, 'funder', top=top);
top = reduce_factor_levels(df, 'installer', 10)
reduce_factor_levels(df_test, 'installer', top=top);

# drop columns that are nearly unique per row, redundant, or constant
drop = ['wpt_name', 'num_private', 'subvillage', 'region_code', 'district_code', 'lga', 'ward', 'recorded_by', 'scheme_name']
df.drop(drop, axis=1, inplace=True)
df_test.drop(drop, axis=1, inplace=True)

# unify missing scheme_management values as an empty string
df.loc[df.scheme_management == 'None', 'scheme_management'] = ''
df.loc[df.scheme_management.isnull(), 'scheme_management'] = ''
df_test.loc[df_test.scheme_management.isnull(), 'scheme_management'] = ''
# flag rows where the construction year is known (0 means missing)
df['construction_date_known'] = (df.construction_year > 0).astype(np.int32)
df_test['construction_date_known'] = (df_test.construction_year > 0).astype(np.int32)

# bucket construction_year into decades; missing years (0) map to decade 0
min_year = df[df.construction_year > 0].construction_year.min() // 10 - 1
df['construction_decade'] = df.construction_year // 10 - min_year
df_test['construction_decade'] = df_test.construction_year // 10 - min_year
df.loc[df.construction_decade < 0, 'construction_decade'] = 0
df_test.loc[df_test.construction_decade < 0, 'construction_decade'] = 0

# keep only the 20 most frequent construction years as separate levels
top = reduce_factor_levels(df, 'construction_year', 20)
reduce_factor_levels(df_test, 'construction_year', top=top);
# merge a rare extraction_type level into 'other'
df.loc[df.extraction_type == 'other - mkulima/shinyanga', 'extraction_type'] = 'other'

# bin gps_height into 500 m bands and drop the raw column
heights = np.arange(-1, df.gps_height.max() + 500, 500)
height_labels = list(range(len(heights) - 1))
df['gps_height_rounded'] = pd.cut(df.gps_height, bins=heights, labels=height_labels)
df_test['gps_height_rounded'] = pd.cut(df_test.gps_height, bins=heights, labels=height_labels)
df.drop(['gps_height'], axis=1, inplace=True)
df_test.drop(['gps_height'], axis=1, inplace=True)
# bin population the same way, in steps of 500
pops = np.arange(-1, df.population.max() + 500, 500)
pops_labels = list(range(len(pops) - 1))
df['pop_rounded'] = pd.cut(df.population, bins=pops, labels=pops_labels)
df_test['pop_rounded'] = pd.cut(df_test.population, bins=pops, labels=pops_labels)
df.drop(['population'], axis=1, inplace=True)
df_test.drop(['population'], axis=1, inplace=True)

df.drop(['date_recorded'], axis=1, inplace=True)
df_test.drop(['date_recorded'], axis=1, inplace=True)

# fill the remaining gaps: missing booleans with True; gps heights that
# fell outside the bins (negative values) become NaN, send them to band 0
df.public_meeting.fillna(True, inplace=True)
df_test.public_meeting.fillna(True, inplace=True)
df.permit.fillna(True, inplace=True)
df_test.permit.fillna(True, inplace=True)
df.gps_height_rounded.fillna(0, inplace=True)
df_test.gps_height_rounded.fillna(0, inplace=True)
# split into features/target; one-hot encode categoricals, scale numerics
X, y, X_test = df.drop(['id', 'status_group'], axis=1), \
               df.status_group, \
               df_test.drop(['id'], axis=1)

def prepare(X_train, X_test):
    """One-hot encode the object columns and standardize the rest."""
    from sklearn.preprocessing import StandardScaler
    from sklearn.feature_extraction import DictVectorizer
    objects = X_train.select_dtypes(include=['O']).columns.values
    numeric = X_train.select_dtypes(exclude=['O']).columns.values
    # DictVectorizer turns each distinct string value into an indicator column
    dv = DictVectorizer(sparse=False)
    data_encoded_tr = dv.fit_transform(X_train[objects].to_dict(orient='records'))
    data_encoded_ts = dv.transform(X_test[objects].to_dict(orient='records'))
    # scale the numeric block with statistics fitted on the training part only
    ss = StandardScaler()
    data_scaled_tr = ss.fit_transform(X_train[numeric])
    data_scaled_ts = ss.transform(X_test[numeric])
    train = np.hstack((data_encoded_tr, data_scaled_tr))
    test = np.hstack((data_encoded_ts, data_scaled_ts))
    return train, test

x_train, x_test = prepare(X, X_test)

# encode the status_group labels as integers
from sklearn.preprocessing import LabelEncoder
y_encoder = LabelEncoder()
y = y_encoder.fit_transform(y)
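As a reminder of how DictVectorizer handles the object columns inside prepare(), a minimal standalone sketch (toy values, not the real ones):

from sklearn.feature_extraction import DictVectorizer
toy = pd.DataFrame({'funder': ['Danida', 'OTHER']})
dv = DictVectorizer(sparse=False)
# each distinct string becomes its own indicator column
print(dv.fit_transform(toy.to_dict(orient='records')))  # [[1. 0.], [0. 1.]]
print(dv.feature_names_)  # ['funder=Danida', 'funder=OTHER']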
Task: improve yesterday's results using ensembling, stacking, and blending.
In [24]:
from sklearn.model_selection import cross_val_score
In [25]:
from lightgbm import LGBMClassifier
clf2 = LGBMClassifier(max_bin=460, learning_rate=0.11, n_estimators=140, num_leaves=130)
scores = cross_val_score(clf2, x_train, y)
np.mean(scores), 2*np.std(scores)
Out[25]:
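The hyperparameters above look hand-tuned; a minimal RandomizedSearchCV sketch of how such values could be searched for (the ranges below are my assumptions, not the grid actually used):

from scipy.stats import randint, uniform
from sklearn.model_selection import RandomizedSearchCV
param_dist = {
    'num_leaves': randint(50, 200),
    'n_estimators': randint(100, 300),
    'learning_rate': uniform(0.05, 0.15),
    'max_bin': randint(255, 512),
}
search = RandomizedSearchCV(LGBMClassifier(), param_dist, n_iter=20, cv=3, n_jobs=-1, random_state=17)
#search.fit(x_train, y)  # commented out: slow, like the other long cells here
#search.best_params_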
In [26]:
clf2 = clf2.fit(x_train, y)
In [27]:
y_te = clf2.predict(x_test)
y_te
Out[27]:
In [28]:
ans_nn = pd.DataFrame({'id': df_test['id'], 'status_group': y_encoder.inverse_transform(y_te)})
ans_nn.head()
Out[28]:
In [29]:
#ans_nn.to_csv('ans_lightgbm.csv', index=False)
In [4]:
from sklearn.pipeline import make_pipeline
from mlxtend.classifier import EnsembleVoteClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
In [5]:
# base models: quadratic logistic regression, LightGBM, and a decision tree
clfs = [make_pipeline(PolynomialFeatures(degree=2), LogisticRegression()),
        LGBMClassifier(),
        DecisionTreeClassifier()]
In [18]:
# soft voting with double weight on LightGBM
eclf = EnsembleVoteClassifier(clfs=clfs, voting='soft', weights=[1, 2, 1])
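Soft voting averages the base models' class-probability vectors with the given weights and takes the argmax; a self-contained sketch of the rule (the helper name is mine):

import numpy as np

def soft_vote(probas, weights):
    # probas: list of (n_samples, n_classes) arrays from the base models
    avg = np.average(np.stack(probas), axis=0, weights=weights)
    return avg.argmax(axis=1)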
In [19]:
scores = cross_val_score(eclf, x_train, y)
np.mean(scores), 2*np.std(scores)
Out: after four hours of running, fit was still hanging in memory and had produced nothing. Running with voting='hard' and no weights instead finished with mean(scores) = 0.76407.
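The likely bottleneck is PolynomialFeatures(degree=2), which squares the already wide one-hot feature count before every logistic-regression fit. A cheaper sketch (my workaround, not the original setup): drop the quadratic expansion and let scikit-learn's own VotingClassifier parallelize the base fits:

from sklearn.ensemble import VotingClassifier
eclf_light = VotingClassifier(
    estimators=[('lr', LogisticRegression(max_iter=1000)),
                ('lgbm', LGBMClassifier()),
                ('tree', DecisionTreeClassifier())],
    voting='hard', n_jobs=-1)
#scores = cross_val_score(eclf_light, x_train, y)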
In [ ]:
#eclf = eclf.fit(x_train, y)
In [ ]:
#y_te = eclf.predict(x_test)
#y_te
In [ ]:
#ans_nn = pd.DataFrame({'id': df_test['id'], 'status_group': y_encoder.inverse_transform(y_te)})
#ans_nn.head()
In [ ]:
#ans_nn.to_csv('ans_ansambl.csv', index=False)
In [20]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from heamy.dataset import Dataset
from heamy.estimator import Regressor
from heamy.pipeline import ModelsPipeline
np.set_printoptions(precision=6)
np.set_printoptions(suppress=True)
np.random.seed(1000)
In [25]:
# create dataset; note that the heamy Regressors below treat the
# label-encoded classes as a continuous target
dataset = Dataset(x_train, y, x_test)
# initialize RandomForest & LinearRegression as first-stage models
model_rf = Regressor(dataset=dataset, estimator=RandomForestRegressor, parameters={'n_estimators': 50}, name='rf')
model_lr = Regressor(dataset=dataset, estimator=LinearRegression, parameters={'normalize': True}, name='lr')
In [26]:
# Stack the two models: stack(k=10) fits each model with 10-fold CV and
# returns a new dataset whose features are the out-of-fold predictions
pipeline = ModelsPipeline(model_rf, model_lr)
stack_ds = pipeline.stack(k=10, seed=111)
In [27]:
# Train LinearRegression on stacked data (second stage)
stacker = Regressor(dataset=stack_ds, estimator=LinearRegression)
results = stacker.predict()
In [29]:
results
Out[29]:
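These results are continuous regression outputs over the label-encoded classes, so they cannot be submitted as-is. A crude sketch of mapping them back (assuming, as above, that the three classes are encoded 0..2; rounding a class code is questionable, but it is the only way to reuse this regression stack):

# round to the nearest class index, clip to the valid range, decode
y_stack = np.clip(np.rint(results), 0, len(y_encoder.classes_) - 1).astype(int)
y_labels = y_encoder.inverse_transform(y_stack)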
In [35]:
# Use randomly sampled 20% of the data as a holdout dataset
res = model_lr.validate(mean_absolute_error, test_size=0.20)
In [12]:
from mlxtend.classifier import StackingCVClassifier, StackingClassifier
from sklearn.ensemble import RandomForestClassifier
In [13]:
sclf = StackingCVClassifier(clfs, RandomForestClassifier())
In [ ]:
scores = cross_val_score(sclf, x_train, y)
np.mean(scores), 2*np.std(scores)
Out: after two hours of running, Windows blue-screened with an "unexpected store exception", followed by a python.exe not responding error.
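Rather than repeat the full run, a sketch of a configuration that should fit in memory (the subsample fraction and fold count are my guesses):

from sklearn.model_selection import train_test_split
# stack on a stratified 20% subsample with 2 internal CV folds
x_sub, _, y_sub, _ = train_test_split(x_train, y, train_size=0.2, stratify=y, random_state=17)
sclf_light = StackingCVClassifier(clfs, RandomForestClassifier(), cv=2)
#scores = cross_val_score(sclf_light, x_sub, y_sub)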
In [ ]:
#sclf = sclf.fit(x_train, y)
In [ ]:
#y_te = sclf.predict(x_test)
#y_te
In [ ]:
#ans_nn = pd.DataFrame({'id': df_test['id'], 'status_group': y_encoder.inverse_transform(y_te)})
#ans_nn.head()
In [ ]:
#ans_nn.to_csv('ans_stack.csv', index=False)