In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline
%load_ext ipycache

import pandas as pd
pd.options.display.max_rows = 300
pd.options.display.max_columns = 300
import lightgbm as lgb
import numpy as np
import scipy
import sklearn as sk
import xgboost as xgb

from eli5 import show_weights

import seaborn as sns
sns.set()

import matplotlib.pyplot as plt


/Users/evgeny/Library/Python/2.7/lib/python/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated since IPython 4.0. You should import from traitlets.config instead.
  "You should import from traitlets.config instead.", ShimWarning)
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.
  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
# Вспомогательные функции
import math
from sklearn.metrics import make_scorer

#A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [
        (math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 
        for i,pred in enumerate(y_pred)
    ]
    return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

def rmse(y, y_pred):
    return np.sqrt(((y_pred - y) ** 2).mean())

def feat_imp(model):
    return pd.DataFrame(
        model.get_fscore().items(), 
        columns=['feature','importance']
    ).sort_values('importance', ascending=False)

def unlog(y):
    return np.expm1(y)

rmse_scoring = make_scorer(rmse, greater_is_better=False)
rmsle_scoring = make_scorer(rmsle, greater_is_better=False)

Препроцессинг фич


In [3]:
def align_to_lb_score(df):
    # https://www.kaggle.com/c/sberbank-russian-housing-market/discussion/32717
    df = df.copy()
    trainsub = df[df.timestamp < '2015-01-01']
    trainsub = trainsub[trainsub.product_type=="Investment"]

    ind_1m = trainsub[trainsub.price_doc <= 1000000].index
    ind_2m = trainsub[trainsub.price_doc == 2000000].index
    ind_3m = trainsub[trainsub.price_doc == 3000000].index

    train_index = set(df.index.copy())

    for ind, gap in zip([ind_1m, ind_2m, ind_3m], [10, 3, 2]):
        ind_set = set(ind)
        ind_set_cut = ind.difference(set(ind[::gap]))

        train_index = train_index.difference(ind_set_cut)

    df = df.loc[train_index]
    df["price_doc"] = np.log1p(df["price_doc"].values)
    return df

def preprocess_anomaly(df):
    
    # удаляем из обучающей выборки все нулевые данные. В test данные все заполнены
    df = df.dropna(subset=["preschool_education_centers_raion", "num_room", 
                           "max_floor", "material", "kitch_sq", "floor"])

    df["product_type"].fillna("Investment", inplace=True)

    df["full_sq"] = map(lambda x: x if x > 10 else float("NaN"), df["full_sq"])
    df["life_sq"] = map(lambda x: x if x > 5 else float("NaN"), df["life_sq"])
    df["kitch_sq"] = map(lambda x: x if x > 2 else float("NaN"), df["kitch_sq"])
    
    # superclean
    # https://www.kaggle.com/keremt/very-extensive-cleaning-by-sberbank-discussions
    df.ix[df[df.life_sq > df.full_sq].index, "life_sq"] = np.NaN
    df.ix[df[df.kitch_sq >= df.life_sq].index, "kitch_sq"] = np.NaN

    df.ix[df[df.kitch_sq == 0].index, "kitch_sq"] = np.NaN
    df.ix[df[df.kitch_sq == 1].index, "kitch_sq"] = np.NaN

    df.ix[df[df.num_room == 0].index, "num_room"] = np.NaN
    
    df.ix[df[df.floor == 0].index, "floor"] = np.NaN
    df.ix[df[df.max_floor == 0].index, "max_floor"] = np.NaN
    
    df.ix[df[df.floor > df.max_floor].index, "max_floor"] = np.NaN
    
    df.ix[df[df.state == 33].index, "state"] = np.NaN
    
    df.ix[df[df.build_year == 20052009].index, "build_year"] = 2005
    df.ix[df[df.build_year == 20].index, "build_year"] = 2000
    df.ix[df[df.build_year == 215].index, "build_year"] = 2015

    df.ix[df[df.build_year < 1500].index, "build_year"] = np.NaN
    df.ix[df[df.build_year > 2022].index, "build_year"] = np.NaN
    
    return df

In [4]:
def smoothed_likelihood(targ_mean, nrows, globalmean, alpha=10):
    try:
        return (targ_mean * nrows + globalmean * alpha) / (nrows + alpha)
    except Exception:
        return float("NaN")

def mess_y_categorial(df, nfolds=3, alpha=10):
    from copy import copy

    folds = np.array_split(df, nfolds)
    newfolds = []
    for i in range(nfolds):
        fold = folds[i]

        other_folds = copy(folds)
        other_folds.pop(i)
        other_fold = pd.concat(other_folds)

        newfolds.append(mess_y_categorial_fold(fold, other_fold, alpha=10))

    return pd.concat(newfolds)

def mess_y_categorial_fold(fold_raw, other_fold, cols=None, y_col="price_doc", alpha=10):
    fold = fold_raw.copy()
    if not cols:
        cols = list(fold.select_dtypes(include=["object"]).columns)
    globalmean = other_fold[y_col].mean()
    for c in cols:

        target_mean = other_fold[[c, y_col]].fillna("").groupby(c).mean().to_dict()[y_col]
        nrows = other_fold[c].fillna("").value_counts().to_dict()

        fold[c + "_sll"] = fold[c].fillna("").apply(
            lambda x: smoothed_likelihood(target_mean.get(x), nrows.get(x), globalmean, alpha)
        )
    return fold

def feature_exclude(df):
    # Убираем build_year, вместо него остается age_of_building
    # Вероятно из-за build_year переобучение
    feats = ["build_year", "build_year_cat_le"]

    with open("greedy_search.tsv") as gs:
        for line in gs:
            row = line.strip().split("\t")
            if len(row) < 6:
                continue
            if row[5] == "remove":
                feats.append(row[0])

    df = df.drop(feats, axis=1, errors="ignore")

    return df

In [5]:
ALPHA = 50

lbl = sk.preprocessing.LabelEncoder()

def preprocess_categorial(df):

    for c in list(df.columns):
        if df[c].dtype == 'object':
            try:
                try:
                    lbl.fit(list(train_raw[c].values) + list(test[c].values) + list(df[c].values)) 
                except KeyError as e:
                    lbl.fit(df[c].values) 
                df[c + "_le"] = lbl.transform(list(df[c].values))
            except ValueError as e:
                print c, e
                raise

    df = mess_y_categorial(df, 5, alpha=ALPHA)

    df = df.select_dtypes(exclude=['object'])
    return df

def apply_categorial(test, train):
    for c in list(test.columns):
        if test[c].dtype == 'object':
            try:
                lbl.fit(list(train_raw[c].values) + list(test[c].values) + list(train[c].values)) 
            except KeyError:
                lbl.fit(test[c].values) 
            test[c + "_le"] = lbl.transform(list(test[c].values))

    test = mess_y_categorial_fold(test, train, alpha=ALPHA)

    test = test.select_dtypes(exclude=['object'])
    return test

def apply_macro(df):
    macro_cols = [
        'timestamp', "balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
        "micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
        "income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"
    ]
    return df.merge(macro[macro_cols], on='timestamp', how='left')

In [6]:
def preprocess(df):
    df = df.copy()

    ecology = ["no data", "poor", "satisfactory", "good", "excellent"]
    df["ecology_index"] = map(ecology.index, df["ecology"].values)

    df["age_of_building"] = df["timestamp"].apply(lambda x: x.split("-")[0]).astype(int) - df["build_year"]
    df["is_build_in_progress"] = df["age_of_building"].apply(lambda x: "yes" if x < 0 else "no")

    bool_feats = [
        "thermal_power_plant_raion",
        "incineration_raion",
        "oil_chemistry_raion",
        "radiation_raion",
        "railroad_terminal_raion",
        "big_market_raion",
        "nuclear_reactor_raion",
        "detention_facility_raion",
        "water_1line",
        "big_road1_1line",
        "railroad_1line",
        "culture_objects_top_25"
    ]
    for bf in bool_feats:
        try:
            df[bf + "_bool"] = map(lambda x: x == "yes", df[bf].values)
        except:
            pass

    df = preprocess_anomaly(df)

    df['rel_floor'] = df['floor'] / df['max_floor'].astype(float)
    df['rel_kitch_sq'] = df['kitch_sq'] / df['full_sq'].astype(float)
    df['rel_life_sq'] = df['life_sq'] / df['full_sq'].astype(float)

    df["material_cat"] = df.material.fillna(0).astype(int).astype(str).replace("0", "")
    df["state_cat"] = df.state.fillna(0).astype(int).astype(str).replace("0", "")
#     df["num_room_cat"] = df.num_room.fillna(0).astype(int).astype(str).replace("0", "")
#     df["build_year_cat"] = df.build_year.fillna(0).astype(int).astype(str).replace("0", "")
    df["build_year_ten"] = (df.build_year / 10).round()

    df["ID_metro"] = df.ID_metro.fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_railroad_station_walk"] = df.ID_railroad_station_walk.replace("", "-10").fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_railroad_station_avto"] = df.ID_railroad_station_avto.fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_big_road1"] = df.ID_big_road1.fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_big_road2"] = df.ID_big_road2.fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_bus_terminal"] = df.ID_bus_terminal.fillna(-10).astype(int).astype(str).replace("-10", "")

    #    # ratio of living area to full area #
    df["ratio_life_sq_full_sq"] = df["life_sq"] / np.maximum(df["full_sq"].astype("float"),1)
    df["ratio_life_sq_full_sq"].ix[df["ratio_life_sq_full_sq"]<0] = 0
    df["ratio_life_sq_full_sq"].ix[df["ratio_life_sq_full_sq"]>1] = 1

    #     # ratio of kitchen area to living area #
    df["ratio_kitch_sq_life_sq"] = df["kitch_sq"] / np.maximum(df["life_sq"].astype("float"),1)
    df["ratio_kitch_sq_life_sq"].ix[df["ratio_kitch_sq_life_sq"]<0] = 0
    df["ratio_kitch_sq_life_sq"].ix[df["ratio_kitch_sq_life_sq"]>1] = 1

    #     # ratio of kitchen area to full area #
    df["ratio_kitch_sq_full_sq"] = df["kitch_sq"] / np.maximum(df["full_sq"].astype("float"),1)
    df["ratio_kitch_sq_full_sq"].ix[df["ratio_kitch_sq_full_sq"]<0] = 0
    df["ratio_kitch_sq_full_sq"].ix[df["ratio_kitch_sq_full_sq"]>1] = 1
    df = df.drop(["timestamp"], axis=1, errors="ignore")

    return df

In [7]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv", index_col="id")
test = pd.read_csv("data/test.csv", index_col="id")
macro = pd.read_csv("data/macro.csv")

In [8]:
train_pr = align_to_lb_score(train_raw)
train_pr = preprocess(train_pr)
train_pr = preprocess_categorial(train_pr)
train = feature_exclude(train_pr)
train.head()


/Users/evgeny/Library/Python/2.7/lib/python/site-packages/pandas/core/generic.py:3549: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:31: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:32: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:33: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/pandas/core/indexing.py:517: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/pandas/core/indexing.py:179: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
Out[8]:
full_sq life_sq floor max_floor material num_room kitch_sq state area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion school_quota school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion culture_objects_top_25_raion shopping_centers_raion office_raion full_all male_f female_f young_all work_all ekder_all ekder_male ekder_female 0_6_all 0_6_male 7_14_all 7_14_male 7_14_female 0_17_all 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_slag build_count_mix build_count_before_1920 build_count_1921-1945 build_count_1971-1995 build_count_after_1995 metro_min_avto metro_km_avto metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km cemetery_km incineration_km railroad_station_walk_min railroad_station_avto_km railroad_station_avto_min public_transport_station_min_walk water_km mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km big_road2_km railroad_km zd_vokzaly_avto_km ID_railroad_terminal oil_chemistry_km nuclear_reactor_km radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km catering_km green_part_500 prom_part_500 office_count_500 trc_count_500 trc_sqm_500 cafe_count_500 cafe_sum_500_min_price_avg cafe_sum_500_max_price_avg cafe_avg_price_500 cafe_count_500_na_price cafe_count_500_price_500 cafe_count_500_price_1000 cafe_count_500_price_high big_church_count_500 church_count_500 mosque_count_500 leisure_count_500 sport_count_500 green_part_1000 prom_part_1000 office_count_1000 office_sqm_1000 trc_count_1000 trc_sqm_1000 cafe_count_1000 cafe_sum_1000_min_price_avg cafe_sum_1000_max_price_avg cafe_avg_price_1000 cafe_count_1000_na_price cafe_count_1000_price_500 cafe_count_1000_price_1000 cafe_count_1000_price_1500 cafe_count_1000_price_2500 cafe_count_1000_price_high big_church_count_1000 church_count_1000 mosque_count_1000 leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 price_doc age_of_building incineration_raion_bool oil_chemistry_raion_bool big_market_raion_bool water_1line_bool railroad_1line_bool culture_objects_top_25_bool rel_floor rel_kitch_sq rel_life_sq build_year_ten ratio_life_sq_full_sq ratio_kitch_sq_life_sq ratio_kitch_sq_full_sq product_type_le sub_area_le culture_objects_top_25_le thermal_power_plant_raion_le incineration_raion_le oil_chemistry_raion_le radiation_raion_le railroad_terminal_raion_le big_market_raion_le nuclear_reactor_raion_le ID_metro_le ID_railroad_station_walk_le water_1line_le ID_big_road1_le ID_big_road2_le railroad_1line_le ID_bus_terminal_le ecology_le is_build_in_progress_le material_cat_le state_cat_le product_type_sll sub_area_sll culture_objects_top_25_sll thermal_power_plant_raion_sll incineration_raion_sll oil_chemistry_raion_sll radiation_raion_sll railroad_terminal_raion_sll big_market_raion_sll nuclear_reactor_raion_sll detention_facility_raion_sll ID_metro_sll ID_railroad_station_walk_sll ID_railroad_station_avto_sll water_1line_sll ID_big_road1_sll big_road1_1line_sll ID_big_road2_sll railroad_1line_sll ID_bus_terminal_sll ecology_sll is_build_in_progress_sll material_cat_sll state_cat_sll
id
7675 73.0 36.0 17.0 17.0 1.0 2.0 11.0 NaN 14883622.34 72131 0.024444 0.158249 7567 3848.0 4.0 8687.0 0 NaN 1 0 4 0 0 0 102828 47783 55045 13954 49242 8935 2488 6447 7567 3867 5731 3000 2731 15057 7273 26154 13689 12465 12659 6564 1204.0 12.0 793.0 36.0 179.0 14.0 97.0 64.0 9.0 298.0 382.0 42.0 176.0 1.786095 0.631513 0.631513 0.127746 0.211999 0.830687 0.800951 0.838160 2.650983 2.953143 34.680939 2.890078 4.609301 3.014127 0.810526 2.890537 10.868389 14.824881 15.565422 16.934712 2.223887 2.224715 1.881386 22.217934 5 8.379305 14.929271 1.007503 3.189669 9.700928 2.105310 12.015274 1.076679 0.811599 4.019799 4.019799 15.342829 3.517383 2.962486 14.068434 0.375622 5.862093 2.680298 2.285872 3.130025 0.222339 0.211999 1.897119 1.887930 9.372257 5.573473 5.052496 4.095476 0.642124 0.00 0.00 0 0 0 0 NaN NaN NaN 0 0 0 0 0 0 0 0 0 4.06 1.27 0 0 0 0 2 750.00 1250.00 1000.00 0 0 1 1 0 0 0 0 0 0 1 1 10.47 7.43 0 0 0 0 2 750.00 1250.00 1000.00 0 0 1 1 0 0 0 0 0 0 1 1 13.61 8.44 0 0 0 0 3 666.67 1166.67 916.67 0 0 2 1 0 0 0 1 3 0 0 1 1 15.92 8.28 0 0 3 62100 10 750.00 1250.00 1000.00 0 0 0 0 0 4 4 0 0 8 1 15.84 6.22 138650 20 405046 38 686.49 1 8 16 11 1 1 0 5 9 0 2 17 16.128046 NaN True False False False False False 1.000000 0.150685 0.493151 NaN 0.493151 0.305556 0.150685 0 45 0 0 1 0 1 0 0 0 12 25 0 2 55 0 7 1 0 0 0 15.801033 15.729914 15.717359 15.731408 15.631294 15.737393 15.812208 15.732827 15.741379 15.731524 15.729544 15.782371 15.738976 15.717732 15.742079 15.601210 15.732289 15.636870 15.736438 15.658769 15.757357 15.739732 15.706233 15.603761
8059 11.0 11.0 2.0 5.0 2.0 1.0 NaN 3.0 10071560.22 102726 0.048791 0.000000 6374 165.0 5.0 9337.0 1 4702.0 5 1 23 2 5 87 75377 34015 41362 14868 61102 26756 8775 17981 6374 3205 7538 3585 3953 16584 8501 14705 7343 7362 13042 6343 641.0 19.0 4.0 0.0 550.0 48.0 8.0 11.0 1.0 206.0 122.0 42.0 91.0 1.798776 1.291876 0.582523 0.377428 0.185809 0.985279 0.465981 1.659437 2.325364 14.002650 29.369625 4.238200 5.481681 2.639782 0.514685 12.114726 2.301037 0.189294 1.310001 2.109561 2.301037 2.902523 1.622346 4.742795 50 10.902881 4.169488 0.624461 5.162609 1.846188 4.797509 9.892177 3.609579 0.773059 1.255166 3.526529 2.069458 1.728451 0.551081 5.850548 1.222891 1.519220 0.185809 0.300637 0.112276 0.683229 0.197451 0.373226 0.264107 2.761752 2.985751 0.224448 1.592383 0.057430 0.17 0.00 14 1 2720 42 985.37 1597.56 1291.46 1 8 7 0 1 4 0 9 0 12.07 0.00 44 363707 2 10220 99 866.67 1416.67 1141.67 9 25 17 28 16 0 5 12 0 12 7 0 15.03 0.00 81 677136 5 122060 225 901.93 1487.92 1194.93 18 49 50 56 39 13 0 14 0 18 21 0 11.75 0.82 129 1394447 17 457342 474 912.41 1503.45 1207.93 39 103 115 114 69 30 4 24 45 0 24 36 3 9.80 1.21 258 3025460 28 1002718 979 933.08 1529.12 1231.10 69 217 143 63 12 60 102 1 44 79 4 9.38 4.35 10742760 83 3434795 2295 908.42 157 539 537 562 339 135 26 133 207 1 89 161 14.827112 106.0 False False False False False True 0.400000 NaN 1.000000 191.0 1.000000 NaN NaN 0 29 1 0 0 0 1 0 0 0 192 160 0 33 7 0 4 0 0 1 3 15.801033 16.524056 15.971515 15.731408 15.743042 15.737393 15.812208 15.732827 15.741379 15.731524 15.729544 16.191397 16.237215 16.391445 15.742079 15.949550 15.732289 16.025274 15.736438 15.948738 15.828350 15.739732 15.831025 15.838145
8114 85.0 NaN 13.0 22.0 4.0 3.0 NaN NaN 25536296.81 4001 0.496315 0.007122 275 NaN 0.0 NaN 0 NaN 0 0 0 0 1 0 17790 8350 9443 574 2566 861 244 617 275 143 264 136 128 646 311 3796 2035 1762 506 261 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.857265 2.285812 2.285812 1.402479 1.771738 5.118129 0.040645 0.288289 1.691135 19.639820 51.590936 4.299245 5.486797 0.855866 0.935755 7.683392 17.691722 20.412259 21.581123 22.450272 2.855063 3.550355 1.864007 26.263040 50 30.512432 17.961847 9.591465 6.035566 10.620355 4.266105 16.109519 6.997373 2.085627 3.085270 3.279429 14.462098 8.863268 7.339226 25.648392 2.616487 13.457175 9.891581 1.258676 4.329688 1.774558 1.771738 1.559771 1.540396 13.217829 14.023084 10.446626 5.115783 0.528478 38.39 2.83 0 0 0 0 NaN NaN NaN 0 0 0 0 0 0 0 0 0 40.95 1.52 0 0 0 0 2 1000.00 1750.00 1375.00 0 0 1 0 1 0 0 0 0 0 0 0 38.34 1.30 0 0 1 17000 2 1000.00 1750.00 1375.00 0 0 1 0 1 0 0 0 0 0 0 0 41.01 1.90 0 0 1 17000 3 833.33 1500.00 1166.67 0 0 2 0 1 0 0 1 3 0 0 0 0 44.85 1.97 0 0 2 22000 10 680.00 1200.00 940.00 0 1 1 0 0 1 3 0 0 6 0 35.53 5.77 117300 3 139300 18 758.82 1 3 8 4 1 1 0 2 12 0 0 9 15.840298 NaN False False False False False False 0.590909 NaN NaN NaN NaN NaN NaN 1 103 0 0 0 0 0 0 0 0 123 102 0 4 19 0 12 2 0 2 0 15.602368 15.670687 15.717359 15.731408 15.743042 15.737393 15.690301 15.732827 15.741379 15.731524 15.729544 15.668391 15.492049 15.691381 15.742079 15.503679 15.732289 15.634725 15.736438 15.582105 15.467725 15.739732 15.919001 15.603761
8138 53.0 30.0 10.0 16.0 1.0 2.0 8.0 3.0 5646405.14 79576 0.258663 0.101872 4857 2703.0 5.0 7236.0 0 NaN 3 0 4 0 3 3 68630 33005 35625 10019 51295 18262 5511 12751 4857 2424 4583 2341 2242 11158 5468 15292 7613 7679 8865 4433 301.0 9.0 71.0 47.0 71.0 13.0 84.0 4.0 2.0 0.0 52.0 108.0 96.0 1.959499 1.503698 1.535023 0.408673 0.364994 0.875814 0.529494 0.000000 1.987032 3.252337 42.776109 3.619377 4.524221 2.264785 1.162500 2.169200 11.018216 13.270117 13.854330 15.345902 2.116263 2.169200 0.619521 13.396969 101 19.134967 4.012248 0.563397 2.092171 4.335473 4.442725 9.819833 2.238991 0.821462 0.792641 2.875125 9.328147 2.044473 1.518608 5.627885 1.247381 5.880034 1.038248 0.777665 0.162024 0.447976 0.364994 1.410243 1.412551 3.985309 13.058708 7.737190 7.572115 0.659322 0.00 40.26 2 0 0 0 NaN NaN NaN 0 0 0 0 0 0 0 0 1 13.80 30.07 3 71754 4 92000 9 487.50 875.00 681.25 1 3 4 1 0 0 0 0 0 0 3 0 16.16 17.90 5 106254 7 105414 26 775.00 1270.83 1022.92 2 7 6 8 2 1 0 2 0 0 7 1 16.81 17.64 10 179554 12 305634 41 674.36 1141.03 907.69 2 11 15 10 2 1 0 3 3 0 0 10 2 12.36 16.69 14 529054 24 838534 70 742.42 1234.85 988.64 4 20 5 3 0 3 4 0 0 24 2 15.44 18.47 766701 48 2311301 142 736.15 12 34 43 39 10 4 0 11 21 1 0 46 16.012735 33.0 False False False False False False 0.625000 0.150943 0.566038 198.0 0.566038 0.266667 0.150943 0 55 0 0 0 0 1 0 0 0 202 224 0 43 0 0 0 3 0 0 3 15.801033 15.743523 15.717359 15.731408 15.743042 15.737393 15.812208 15.732827 15.741379 15.731524 15.729544 15.696960 15.772069 15.749849 15.742079 15.786004 15.732289 15.692321 15.736438 15.781152 15.840672 15.739732 15.706233 15.838145
8147 41.0 37.0 13.0 17.0 1.0 1.0 NaN NaN 21494094.80 7122 0.262459 0.017647 489 NaN 0.0 NaN 0 NaN 0 0 0 0 0 0 9553 4529 5024 1021 4568 1533 435 1099 489 254 469 242 228 1150 553 2155 1206 950 900 465 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.769542 2.155282 2.155282 0.618740 0.827115 2.031773 0.469741 1.281134 1.882638 11.672420 68.215230 5.684602 7.053650 5.603917 0.226807 7.370562 20.604213 23.724567 24.996049 25.701769 4.841627 5.344093 1.250001 28.972859 32 23.727404 17.317707 7.550486 4.835593 10.058643 9.568216 14.068657 6.187529 0.888687 5.093315 13.230213 21.811800 10.140839 4.375357 26.882114 2.710695 18.634244 9.017858 3.407867 5.350487 1.492907 0.827115 1.841105 0.630511 3.003040 21.372786 15.160570 13.306110 1.554069 0.76 0.00 0 0 0 0 NaN NaN NaN 0 0 0 0 0 0 0 0 0 10.51 0.00 0 0 0 0 0 NaN NaN NaN 0 0 0 0 0 0 0 1 0 0 1 0 12.80 0.42 0 0 0 0 0 NaN NaN NaN 0 0 0 0 0 0 0 0 0 0 1 0 11.64 1.45 0 0 0 0 2 1000.00 1500.00 1250.00 0 0 0 2 0 0 0 1 2 0 0 3 0 11.09 2.01 0 0 0 0 2 1000.00 1500.00 1250.00 0 0 0 0 0 1 3 0 0 5 0 17.97 4.45 0 5 262000 18 700.00 2 4 4 8 0 0 0 1 8 1 0 12 15.310076 NaN False False False False False False 0.764706 NaN 0.902439 NaN 0.902439 NaN NaN 1 105 0 0 0 0 0 0 0 0 164 134 0 11 31 0 13 2 0 0 0 15.602368 15.582945 15.717359 15.731408 15.743042 15.737393 15.690301 15.732827 15.741379 15.731524 15.729544 15.445493 15.394814 15.394814 15.742079 15.703291 15.732289 15.278771 15.736438 15.611353 15.467725 15.739732 15.706233 15.603761

In [9]:
important_feats = ["full_sq", "life_sq", "kitch_sq", "max_floor"]
# important_feats = ["full_sq", "life_sq"]
# Учим модели для заполнения NA важных полей, последовательность важна
feats_to_remove = ["price_doc", "rel_kitch_sq", "rel_life_sq", "id", "build_year_cat_le", 
                   "age_of_building", "rel_floor", "num_room_cat_le", "build_year_ten", 
                   "ratio_life_sq_full_sq", "ratio_kitch_sq_full_sq", "ratio_kitch_sq_life_sq"]

In [10]:
%%cache na_models.pkl na_models
na_models = {}
xgb_params = {
    'max_depth': 5,
    'n_estimators': 200,
    'learning_rate': 0.05,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}
for f in important_feats:
    t = train[train[f].notnull()]
    fX = t.drop([f] + feats_to_remove, axis=1, errors="ignore")
    fy = t[f].values
    dtrain_all = xgb.DMatrix(fX.values, fy, feature_names=fX.columns)
    model = xgb.train(xgb_params, dtrain_all, num_boost_round=400, verbose_eval=40)
    na_models[f] = model
    print f
    print feat_imp(model).head(10)


[Skipped the cell's code and loaded variables na_models from file '/Users/evgeny/PycharmProjects/kaggle/sber/na_models.pkl'.]
full_sq
              feature  importance
202           life_sq        1924
78           kitch_sq         732
23          max_floor         559
171          num_room         431
195             floor         381
222   product_type_le         223
34   material_cat_sll         158
145     state_cat_sll         128
2    product_type_sll         112
90              state         110
life_sq
                     feature  importance
84                   full_sq        1605
72                  kitch_sq         591
23                 max_floor         524
85                     state         484
178                    floor         470
2           product_type_sll         264
157                 num_room         221
82   is_build_in_progress_le         203
34          material_cat_sll         201
137            state_cat_sll         178
kitch_sq
              feature  importance
92            full_sq        1507
157           life_sq         974
22          max_floor         597
189             floor         357
167          num_room         274
33   material_cat_sll         237
38           material         217
93              state         142
214   product_type_le         127
144     state_cat_sll         126
max_floor
              feature  importance
95            full_sq         912
202             floor         807
83           kitch_sq         649
185           life_sq         600
39   material_cat_sll         313
45           material         231
96              state         156
150     state_cat_sll         134
170   kindergarten_km         123
2    product_type_sll         121

In [11]:
def fill_na_xgb(df_orig):
    df = df_orig.copy()
    for f in important_feats:
        X_pr = df[df[f].isnull()].drop([f] + feats_to_remove, axis=1, errors="ignore")
        if not len(X_pr):
            continue
        X_pr = xgb.DMatrix(X_pr.values, feature_names=X_pr.columns)

        df.loc[df[f].isnull(), f] = na_models[f].predict(X_pr).round()
        df[f] = df[f].astype(int)
    return df

In [12]:
train = fill_na_xgb(train)

Обучение моделей


In [33]:
from sklearn.model_selection import train_test_split

X = train.drop(["price_doc"], axis=1)
y = train["price_doc"].values

bound = int(len(X) * 0.7)
X_train, X_val, y_train, y_val = X[:bound].copy(), X[bound+1:].copy(), y[:bound].copy(), y[bound+1:].copy()

XGBoost


In [34]:
dtrain_all = xgb.DMatrix(X.values, y, feature_names=X.columns)
dtrain = xgb.DMatrix(X_train.values, y_train, feature_names=X.columns)
dval = xgb.DMatrix(X_val.values, y_val, feature_names=X.columns)

In [35]:
xgb_params = {
    'eta': 0.01,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Uncomment to tune XGB `num_boost_rounds`
model = xgb.train(xgb_params, dtrain, num_boost_round=4000, evals=[(dval, 'val')],
                  early_stopping_rounds=20, verbose_eval=40)

num_boost_round = model.best_iteration


[0]	val-rmse:15.1085
Will train until val-rmse hasn't improved in 20 rounds.
[40]	val-rmse:10.1258
[80]	val-rmse:6.795
[120]	val-rmse:4.56841
[160]	val-rmse:3.08318
[200]	val-rmse:2.09569
[240]	val-rmse:1.44362
[280]	val-rmse:1.01767
[320]	val-rmse:0.74555
[360]	val-rmse:0.57788
[400]	val-rmse:0.479446
[440]	val-rmse:0.424684
[480]	val-rmse:0.395296
[520]	val-rmse:0.379696
[560]	val-rmse:0.37133
[600]	val-rmse:0.366656
[640]	val-rmse:0.363883
[680]	val-rmse:0.362267
[720]	val-rmse:0.361277
[760]	val-rmse:0.360505
[800]	val-rmse:0.359963
[840]	val-rmse:0.359534
[880]	val-rmse:0.359215
[920]	val-rmse:0.358886
[960]	val-rmse:0.358617
[1000]	val-rmse:0.358462
[1040]	val-rmse:0.358334
[1080]	val-rmse:0.358153
[1120]	val-rmse:0.357997
[1160]	val-rmse:0.357889
Stopping. Best iteration:
[1148]	val-rmse:0.357838


In [16]:
cv_output = xgb.cv(xgb_params, dtrain_all, num_boost_round=4000, 
                   verbose_eval=100, early_stopping_rounds=100, nfold=5)


[0]	train-rmse:15.0807+0.00143998	test-rmse:15.0807+0.00574543
[100]	train-rmse:5.53406+0.000562041	test-rmse:5.53423+0.00559221
[200]	train-rmse:2.05268+0.000396049	test-rmse:2.05461+0.00530618
[300]	train-rmse:0.809839+0.000644401	test-rmse:0.817379+0.00478175
[400]	train-rmse:0.412843+0.00126441	test-rmse:0.431577+0.00442863
[500]	train-rmse:0.316289+0.00163335	test-rmse:0.34555+0.00495535
[600]	train-rmse:0.294162+0.001633	test-rmse:0.33052+0.00537192
[700]	train-rmse:0.285144+0.00163495	test-rmse:0.32731+0.0056672
[800]	train-rmse:0.27898+0.00175593	test-rmse:0.326178+0.00576283
[900]	train-rmse:0.273645+0.00176602	test-rmse:0.325591+0.00585998
[1000]	train-rmse:0.268594+0.00162843	test-rmse:0.325201+0.00590836
[1100]	train-rmse:0.263946+0.00164506	test-rmse:0.324894+0.00600377
[1200]	train-rmse:0.259417+0.00146795	test-rmse:0.324697+0.00607405
[1300]	train-rmse:0.255426+0.00141924	test-rmse:0.324576+0.00604754
[1400]	train-rmse:0.251505+0.00135145	test-rmse:0.324488+0.00606445
[1500]	train-rmse:0.247599+0.00139538	test-rmse:0.32446+0.00607018
[1600]	train-rmse:0.24397+0.00147552	test-rmse:0.324437+0.00602382

In [36]:
xgbmodel = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_round, verbose_eval=40)

y_pred = xgbmodel.predict(dtrain)

print "predict-train:", rmse(y_pred, y_train)
submdf = pd.DataFrame({"id": X_train.index, "price_doc": unlog(y_pred)})
submdf.to_csv("xgb_train_preds.csv", index=False)

y_pred = xgbmodel.predict(dval)

print "predict-val:", rmse(y_pred, y_val)
submdf = pd.DataFrame({"id": X_val.index, "price_doc": unlog(y_pred)})
submdf.to_csv("xgb_val_preds.csv", index=False)

feat_imp(model).head(10)


predict-train: 0.247358441492
predict-val: 0.357843273327
Out[36]:
feature importance
114 full_sq 2183
130 age_of_building 650
192 life_sq 561
236 floor 545
28 max_floor 540
198 kindergarten_km 338
99 kitch_sq 297
44 rel_life_sq 296
115 state 278
83 workplaces_km 264

LightGBM


In [37]:
RS = 20170501
np.random.seed(RS)

FACT_ROUNDS=0
ROUNDS = 2000
lgb_params = {
    'objective': 'regression',
    'metric': 'rmse',
    'boosting': 'gbdt',
    'learning_rate': 0.01,
#     'verbose': 1,
#     'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': RS,
#     'feature_fraction': 0.7,
#     'feature_fraction_seed': RS,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
#     'max_bin': 100,
    'max_depth': 10,
    'num_rounds': ROUNDS
}
lgb_train_all = lgb.Dataset(X, y)
lgb_train = lgb.Dataset(X_train, y_train)

In [38]:
cvres = pd.DataFrame(lgb.cv(params=lgb_params, train_set=lgb_train, nfold=5, shuffle=False, 
                            early_stopping_rounds=100, verbose_eval=100, num_boost_round=ROUNDS))
FACT_ROUNDS = len(cvres)


[100]	cv_agg's rmse: 0.360021 + 0.0125667
[200]	cv_agg's rmse: 0.329922 + 0.0117053
[300]	cv_agg's rmse: 0.321521 + 0.0115273
[400]	cv_agg's rmse: 0.318169 + 0.0113741
[500]	cv_agg's rmse: 0.31661 + 0.0112249
[600]	cv_agg's rmse: 0.315936 + 0.0111462
[700]	cv_agg's rmse: 0.315747 + 0.0108937
[800]	cv_agg's rmse: 0.315708 + 0.0106772
[900]	cv_agg's rmse: 0.315846 + 0.0105506

In [39]:
lgbmodel = lgb.train(lgb_params, lgb_train, num_boost_round=FACT_ROUNDS or ROUNDS)

pd.DataFrame({
    "name": lgbmodel.feature_name(), 
    "imp": lgbmodel.feature_importance()}
).sort_values("imp", ascending=False).head(20)


Out[39]:
imp name
0 1694 full_sq
236 961 age_of_building
3 470 max_floor
2 338 floor
272 333 sub_area_sll
282 302 ID_metro_sll
245 299 rel_life_sq
1 292 life_sq
7 260 state
283 254 ID_railroad_station_walk_sll
60 252 kindergarten_km
248 252 ratio_kitch_sq_life_sq
97 249 public_healthcare_km
130 240 prom_part_1000
70 232 public_transport_station_min_walk
111 231 green_part_500
6 229 kitch_sq
244 221 rel_kitch_sq
99 220 workplaces_km
102 214 additional_education_km

In [40]:
y_pred = lgbmodel.predict(X_train)

print "predict-train:", rmse(y_pred, y_train)
submdf = pd.DataFrame({"id": X_train.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lgb_train_preds.csv", index=False)

y_pred = lgbmodel.predict(X_val)

print "predict-val:", rmse(y_pred, y_val)
submdf = pd.DataFrame({"id": X_val.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lgb_val_preds.csv", index=False)


predict-train: 0.239486134262
predict-val: 0.356107088195

Vowpal Wabbit


In [123]:
from vowpalwabbit.sklearn_vw import VWRegressor

Lasso


In [29]:
from sklearn.base import TransformerMixin
from scipy.stats import skew

class SkewLogAlign(TransformerMixin):
    skewed_feats = None
    skew_treshold = 0.75

    def __init__(self, skew_treshold=0.75):
        self.skew_treshold = skew_treshold

    def fit(self, X, y=None):
        #log transform skewed numeric features:
        df = pd.DataFrame(X, dtype=np.float64)

        skewed_feats = df.apply(lambda x: skew(x.dropna())) #compute skewness
        skewed_feats = skewed_feats[skewed_feats > 0.75]
        self.skewed_feats = skewed_feats.index
        return self

    def transform(self, X):
        df = pd.DataFrame(X, dtype=np.float64)
        df[self.skewed_feats] = np.log1p(df[self.skewed_feats].values)
        return df.values

import sys
class FillNaWithConstant(TransformerMixin):
    nan_value = 0
    inf_value = None
    minf_value = None
    def __init__(self, nan_value=0, inf_value=sys.maxint - 1, minf_value=-sys.maxint - 1):
        self.nan_value = nan_value
        self.inf_value = inf_value
        self.minf_value = minf_value
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = pd.DataFrame(X).fillna(self.nan_value)
        df = df.replace(np.inf, self.inf_value)
        df = df.replace(-np.inf, self.minf_value)
        return df.values

In [45]:
from sklearn.pipeline import Pipeline
lasso_feat_pipeline = Pipeline([
    ("skew", SkewLogAlign()),
    ("fillna", FillNaWithConstant()),
])

In [143]:
from sklearn.linear_model import LassoCV
LASSO_alphas = [1, 0.1, 0.001, 0.0005]
lasso_cv_model = LassoCV(alphas = [1, 0.1, 0.001, 0.0005], cv=5, max_iter=50000, verbose=True, n_jobs=-1)
lasso_cv_model.fit(lasso_feat_pipeline.transform(X.values), y)
print "alpha:", lasso_cv_model.alpha_
print "MSE:"
print zip(LASSO_alphas, np.sqrt(lasso_cv_model.mse_path_))
print pd.Series(lasso_cv_model.coef_, index=X.columns).sort_values(ascending=False)[:20]


/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:22: RuntimeWarning: divide by zero encountered in log1p
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:22: RuntimeWarning: invalid value encountered in log1p
....................[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.8s finished
alpha: 0.0005
MSE: [[ 0.21501046  0.23015306  0.23120176  0.23427819  0.30341004]
 [ 0.15488957  0.16189127  0.16288215  0.16238237  0.22927763]
 [ 0.11251749  0.1203526   0.10807458  0.12416594  0.16574154]
 [ 0.11215568  0.11882564  0.10656277  0.12069194  0.16437967]]

In [46]:
from sklearn.linear_model import Lasso

best_alpha = 0.001
lasso_model = Pipeline([
    ("feat", lasso_feat_pipeline),
    ("clf", Lasso(alpha=best_alpha, max_iter=50000))
])

lasso_model.fit(X_train.values, y_train)


/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:22: RuntimeWarning: divide by zero encountered in log1p
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:22: RuntimeWarning: invalid value encountered in log1p
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/sklearn/linear_model/coordinate_descent.py:484: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Fitting data with very small alpha may cause precision problems.
  ConvergenceWarning)
Out[46]:
Pipeline(steps=[('feat', Pipeline(steps=[('skew', <__main__.SkewLogAlign object at 0x10785e190>), ('fillna', <__main__.FillNaWithConstant object at 0x10785e750>)])), ('clf', Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=50000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [47]:
y_pred = lasso_model.predict(X_train.values)

print "predict-train:", rmse(y_pred, y_train)
submdf = pd.DataFrame({"id": X_train.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lasso_train_preds.csv", index=False)

y_pred = lasso_model.predict(X_val.values)

print "predict-validation:", rmse(y_pred, y_val)
submdf = pd.DataFrame({"id": X_val.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lasso_val_preds.csv", index=False)


/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:22: RuntimeWarning: divide by zero encountered in log1p
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:22: RuntimeWarning: invalid value encountered in log1p
predict-train: 0.333160734842
predict-validation: 0.407328141516

Submission


In [41]:
test_pr = preprocess(test)
train_pr = preprocess(train_raw)
test_pr = apply_categorial(test_pr, train_pr)
test_pr = feature_exclude(test_pr)

test_pr = fill_na_xgb(test_pr)


/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:31: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:32: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:33: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

XGBoost


In [42]:
# XGB
dtest = xgb.DMatrix(test_pr.values, feature_names=test_pr.columns)
y_pred = xgbmodel.predict(dtest)

submdf = pd.DataFrame({"id": test_pr.index, "price_doc": unlog(y_pred)})
submdf.to_csv("xgb_sub.csv", index=False)
!head xgb_sub.csv


id,price_doc
30474,4742232.5
30475,6859833.0
30476,4995267.5
30477,5349565.5
30478,4658073.0
30479,5667926.0
30480,4403455.0
30481,4455123.5
30482,4962353.0

LightGBM


In [43]:
y_pred = lgbmodel.predict(test_pr)
submdf = pd.DataFrame({"id": test_pr.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lgb_sub.csv", index=False)
!head lgb_sub.csv


id,price_doc
30474,6290504.09921
30475,9001023.32542
30476,6180642.37022
30477,7994646.047
30478,5992177.9955
30479,6900884.07179
30480,5299139.72796
30481,5150081.11666
30482,5607004.45297

Lasso


In [51]:
y_pred = lasso_model.predict(test_pr.values)
submdf = pd.DataFrame({"id": test_pr.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lasso_sub.csv", index=False)
!head lasso_sub.csv


id,price_doc
30474,5621747.59302
30475,8050329.46028
30476,5528565.7515
30477,6401597.65974
30478,4977892.1805
30479,7481715.86314
30480,4968474.93149
30481,4040944.24532
30482,5150697.61831
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:22: RuntimeWarning: divide by zero encountered in log1p
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/ipykernel_launcher.py:22: RuntimeWarning: invalid value encountered in log1p

Ensemble


In [46]:
models = ["lgb", "xgb"]

In [47]:
etrain = pd.DataFrame(index=X_val.index)

etrain = etrain.join(train[["price_doc"]])

for i, p in enumerate(models):
    pred = pd.read_csv("%s_val_preds.csv" % p, index_col="id", names=["id", "p_%s" % i], header=0)
    etrain = etrain.join(pred)

eX = etrain.drop("price_doc", axis=1)

ey = etrain["price_doc"].values

etrain.head()


Out[47]:
price_doc p_0 p_1
id
24320 15.952724 8.227190e+06 8096738.0
24321 15.621715 6.601231e+06 6750821.5
24322 15.919645 8.226800e+06 7768140.0
24324 15.645554 7.806705e+06 8211750.5
24325 15.279985 3.973034e+06 4257109.0

Lasso


In [48]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV
emodel = Pipeline([
    ("skew", SkewLogAlign()),
    ("fillna", FillNaWithConstant()),
    ("clf", LassoCV(alphas=None, cv=5, max_iter=50000, verbose=True, n_jobs=-1))
]) 
emodel.fit(eX.values, ey)

lmodel = emodel.named_steps["clf"]
print "alpha:", lmodel.alpha_
print "MSE:"
print np.sqrt(lmodel.mse_path_)
print pd.Series(lmodel.coef_, index=eX.columns).sort_values(ascending=False)[:20]


....................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.2s finished
alpha: 0.000163371864509
MSE:
[[ 0.47509683  0.49749565  0.54020405  0.58316248  0.57470411]
 [ 0.45406655  0.47593357  0.5231112   0.56666981  0.56317275]
 [ 0.43491715  0.45635357  0.5078579   0.54906123  0.54306095]
 [ 0.41751708  0.43861495  0.49428616  0.53325188  0.52492149]
 [ 0.40174069  0.42258294  0.48224566  0.51908954  0.50859868]
 [ 0.38746783  0.40812849  0.47159411  0.50643002  0.49394406]
 [ 0.37458372  0.39511955  0.46219753  0.49513733  0.48081652]
 [ 0.36297881  0.38348797  0.4539305   0.48508373  0.4690824 ]
 [ 0.35254876  0.37308306  0.4466763   0.47614988  0.45861559]
 [ 0.34319447  0.36379772  0.44032699  0.46822479  0.44929767]
 [ 0.33482214  0.35553042  0.4347833   0.46120582  0.44101799]
 [ 0.32734333  0.34818582  0.42995451  0.45499847  0.43367362]
 [ 0.32062695  0.34167468  0.42573704  0.44950673  0.42716939]
 [ 0.31464964  0.33591415  0.42206894  0.44463321  0.42141765]
 [ 0.309339    0.33082708  0.41889381  0.44033892  0.41633814]
 [ 0.30462787  0.32634253  0.41615134  0.43655866  0.41185093]
 [ 0.30045424  0.32239553  0.41378768  0.43323364  0.4078631 ]
 [ 0.29676134  0.31892707  0.41175497  0.43031111  0.40434917]
 [ 0.29349742  0.3158829   0.41001076  0.42774391  0.40125579]
 [ 0.2906153   0.31321484  0.40851758  0.42548996  0.39853436]
 [ 0.28807244  0.31087898  0.40724239  0.42351184  0.39614156]
 [ 0.28583053  0.30883607  0.40615619  0.42177637  0.39403854]
 [ 0.28385505  0.30705106  0.40523357  0.42025409  0.39219088]
 [ 0.28211508  0.30549272  0.40445224  0.41891905  0.39056807]
 [ 0.28058316  0.30413364  0.40379278  0.41774829  0.38914276]
 [ 0.27923472  0.30294878  0.40323827  0.41672163  0.38789131]
 [ 0.27804795  0.30191686  0.40277398  0.41582127  0.38679222]
 [ 0.27700353  0.3010186   0.40238705  0.41503159  0.38582708]
 [ 0.27608441  0.30023709  0.40206646  0.41433887  0.3849793 ]
 [ 0.2752754   0.29955721  0.40180257  0.41373109  0.38423465]
 [ 0.27456319  0.29896673  0.40158699  0.41319765  0.38358025]
 [ 0.27393609  0.29845367  0.40141255  0.41272932  0.38300517]
 [ 0.27338373  0.2980081   0.40127311  0.41231801  0.38249928]
 [ 0.27289696  0.29762163  0.40116324  0.41195662  0.38205446]
 [ 0.27246786  0.29728618  0.40107834  0.41163895  0.38166296]
 [ 0.27208939  0.29699548  0.40101438  0.41135957  0.38131806]
 [ 0.27175539  0.29674359  0.40096815  0.41111374  0.38101443]
 [ 0.27146045  0.29652511  0.40093654  0.41089728  0.3807466 ]
 [ 0.27119983  0.29633633  0.40091717  0.41070658  0.38051041]
 [ 0.27096936  0.29617258  0.40090789  0.41053845  0.38030179]
 [ 0.27076545  0.29603128  0.40090681  0.41039013  0.38011761]
 [ 0.27058483  0.29590902  0.40091248  0.41025918  0.37995468]
 [ 0.27042471  0.2958033   0.40092349  0.41014347  0.37981066]
 [ 0.27028268  0.29571192  0.40093883  0.41004114  0.37968304]
 [ 0.27015658  0.29563327  0.40095739  0.40995057  0.37956987]
 [ 0.27004445  0.29556529  0.40097856  0.40987035  0.37946945]
 [ 0.26994471  0.29550685  0.4010016   0.40979921  0.37938026]
 [ 0.26985583  0.29545664  0.40102594  0.40973608  0.37930097]
 [ 0.26977664  0.29541322  0.40105113  0.40967999  0.37923063]
 [ 0.26970592  0.29537598  0.4010769   0.4096301   0.37916776]
 [ 0.26964273  0.29534407  0.40110271  0.4095857   0.37911173]
 [ 0.26958624  0.29531672  0.40112844  0.40954611  0.37906175]
 [ 0.2695356   0.29529332  0.40115378  0.4095108   0.37901691]
 [ 0.26949023  0.29527329  0.40117879  0.40947926  0.37897685]
 [ 0.26944945  0.29525617  0.40120311  0.40945105  0.37894104]
 [ 0.26941283  0.29524154  0.40122676  0.40942579  0.37890899]
 [ 0.26937987  0.29522905  0.40124957  0.40940315  0.37888008]
 [ 0.2693502   0.29521868  0.40127169  0.40938284  0.37885419]
 [ 0.26932342  0.29520951  0.40129278  0.40936459  0.37883076]
 [ 0.26929922  0.29520198  0.40131303  0.40934817  0.37880975]
 [ 0.26927737  0.29519583  0.40133244  0.40933338  0.3787907 ]
 [ 0.26925756  0.29519056  0.40135089  0.40932004  0.3787736 ]
 [ 0.26923959  0.29518603  0.40136851  0.409308    0.37875805]
 [ 0.2692233   0.29518243  0.40138521  0.40929712  0.37874389]
 [ 0.26920852  0.2951793   0.40140101  0.40928727  0.37873118]
 [ 0.26919506  0.29517687  0.40141604  0.40927834  0.37871957]
 [ 0.26918282  0.29517473  0.40143023  0.40927024  0.37870918]
 [ 0.26917163  0.29517312  0.4014437   0.40926289  0.37869943]
 [ 0.26916144  0.29517196  0.40145629  0.4092562   0.37869094]
 [ 0.26915215  0.29517088  0.40146823  0.40925012  0.37868291]
 [ 0.26914366  0.29517014  0.40147956  0.40924457  0.37867554]
 [ 0.26913587  0.29516939  0.4014901   0.40923951  0.37866896]
 [ 0.26912874  0.29516922  0.40150009  0.40923489  0.37866291]
 [ 0.26912218  0.29516895  0.40150947  0.40923067  0.37865732]
 [ 0.2691162   0.29516858  0.40151826  0.40922681  0.37865237]
 [ 0.26911068  0.29516868  0.40152649  0.40922326  0.37864759]
 [ 0.26910561  0.29516863  0.4015343   0.40922002  0.37864337]
 [ 0.26910096  0.2951687   0.40154162  0.40921704  0.37863947]
 [ 0.26909665  0.29516889  0.40154836  0.4092143   0.37863585]
 [ 0.26909271  0.29516886  0.40155477  0.4092118   0.37863249]
 [ 0.26908906  0.29516923  0.40156076  0.40920948  0.37862937]
 [ 0.26908569  0.29516937  0.40156635  0.40920736  0.37862646]
 [ 0.26908259  0.29516957  0.40157158  0.4092054   0.37862375]
 [ 0.26907973  0.29516982  0.40157646  0.40920359  0.37862123]
 [ 0.26907709  0.29517012  0.40158102  0.40920193  0.37861887]
 [ 0.26907466  0.29517047  0.40158528  0.40920039  0.37861666]
 [ 0.26907241  0.29517084  0.40158924  0.40919898  0.3786146 ]
 [ 0.26907033  0.29517124  0.40159294  0.40919767  0.37861267]
 [ 0.26906841  0.29517167  0.4015964   0.40919646  0.37861085]
 [ 0.26906664  0.29517211  0.4015997   0.40919534  0.37860936]
 [ 0.26906496  0.29517257  0.40160279  0.4091943   0.37860796]
 [ 0.26906341  0.29517303  0.40160567  0.40919334  0.37860665]
 [ 0.26906198  0.29517322  0.40160835  0.40919245  0.37860542]
 [ 0.26906065  0.29517343  0.40161085  0.40919162  0.37860427]
 [ 0.26905942  0.29517364  0.40161319  0.40919086  0.37860319]
 [ 0.26905828  0.29517387  0.40161536  0.40919015  0.37860217]
 [ 0.26905722  0.2951741   0.40161738  0.40918949  0.37860121]
 [ 0.26905624  0.29517434  0.40161926  0.40918888  0.37860031]
 [ 0.26905533  0.29517459  0.40162101  0.40918832  0.37859946]
 [ 0.26905449  0.29517484  0.40162264  0.40918779  0.37859866]]
p_0    0.737447
p_1    0.263791
dtype: float64

LightGBM


In [31]:
eFACT_ROUNDS = 0

In [ ]:
elgb_train = lgb.Dataset(eX, ey)
cvres = pd.DataFrame(lgb.cv(params=lgb_params, train_set=elgb_train, nfold=7, shuffle=False, 
                            early_stopping_rounds=100, verbose_eval=100, num_boost_round=ROUNDS))
eFACT_ROUNDS = len(cvres)

In [ ]:
emodel = lgb.train(lgb_params, elgb_train, num_boost_round=eFACT_ROUNDS or ROUNDS)

In [49]:
etest = test_pr[[]].copy()
for i, p in enumerate(models):
    pred = pd.read_csv("%s_sub.csv" % p, index_col="id", names=["id", "p_%s" % i], header=0)
    etest = etest.join(pred)

y_pred = emodel.predict(etest.values)
df = pd.DataFrame({"id": etest.index, "price_doc": unlog(y_pred)})
df.to_csv("ensemble_sub.csv", index=False)
!head ensemble_sub.csv


id,price_doc
30474,5991274.56122
30475,8601309.46811
30476,5995587.18151
30477,7380798.21396
30478,5753169.26788
30479,6723681.8604
30480,5177294.26037
30481,5085121.49278
30482,5570296.78356

Результаты испытаний

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude, xgb fillna, predict price meter:

val-rmse:42206.6
predict-train: 36746.0165399
kaggle: 0.31331

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude, xgb fillna, predict price doc:

val-rmse:2.57852e+06
train-rmse:1.90168e+06+26844.3  test-rmse:2.66642e+06+56338.9
predict-train: 2021259.19865
kaggle: 0.31386

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude, xgb fillna, predict price meter:

val-rmse:42206.6
predict-train: 36746.0165399
kaggle: 0.31331

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude:

val-rmse:2.55793e+06
train-rmse:1.74066e+06+28727.3  test-rmse:2.65025e+06+64969.5
predict-train: 1881896.66663
kaggle: 0.31344

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:

val-rmse:2.54654e+06
train-rmse:1.74594e+06+24020    test-rmse:2.66053e+06+67300.3
predict-train: 1883352.60935
kaggle: 0.31364

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:

val-rmse:2.55613e+06
train-rmse:1.74466e+06+27385.6  test-rmse:2.66422e+06+69734.1
predict-train: 1888051.35357
kaggle: 0.31366


5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro with other ID, ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:

val-rmse:2.58557e+06
train-rmse:1.98509e+06+26803.7  test-rmse:2.68755e+06+59691.1
predict-train: 2092731.29028
kaggle: 0.31731

#

5*200, no macro, add rel features, no log price, train_without_noise:

val-rmse:2.63772e+06
train-rmse:1.9989e+06+10986.4   test-rmse:2.69158e+06+53020
predict-train: 2076010.27131
kaggle: 0.31720

5*200, no macro, add rel features, no log price, train_with_noise:

val-rmse:2.53378e+06
train-rmse:1.95069e+06+16166.4  test-rmse:2.69703e+06+61455.1
predict-train: 2054421.59869
kaggle: 0.32056

5*200, macro, add rel features, no log price, train_without_noise:

val-rmse:2.79632e+06
train-rmse:1.81015e+06+19781.2  test-rmse:2.6641e+06+123875
predict-train: 1904063.27368
kaggle: 0.32976

5*200, no macro, add rel features, no log price, train_without_noise:

val-rmse:2.61682e+06
train-rmse:1.81123e+06+27681.2  test-rmse:2.66923e+06+53925.7
predict-train: 1899129.43771
kaggle: 0.31592

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter:

val-rmse:2.61055e+06
train-rmse:1.71826e+06+30076.1  test-rmse:2.66515e+06+54583.5
predict-train: 1814572.97424
kaggle: 0.31602

7*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:

val-rmse:2.59955e+06
train-rmse:1.41393e+06+21208.1  test-rmse:2.6763e+06+35553.3
predict-train: 1548257.49121
kaggle: 0.31768

4*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:

val-rmse:2.63407e+06
train-rmse:1.96513e+06+21470.8  test-rmse:2.69417e+06+74288.3
predict-train: 2062299.41091
kaggle: 0.31952

7*200, no macro, add rel features, no log price, train_without_noise, 4000 iter:

val-rmse:2.59955e+06
train-rmse:1.41393e+06+21208.1  test-rmse:2.6763e+06+35553.3
predict-train: 1548257.49121

5*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:

val-rmse:2.61055e+06
train-rmse:1.71826e+06+30076.1  test-rmse:2.66515e+06+54583.5
predict-train: 1814572.97424

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna:

val-rmse:2.61664e+06
train-rmse:1.77892e+06+23111    test-rmse:2.65829e+06+56398.6
predict-train: 1875799.54634
kaggle: 0.31521

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean:

val-rmse:2.6265e+06
train-rmse:1.78478e+06+22545.4  test-rmse:2.66179e+06+60626.3
predict-train: 1881672.27588
kaggle: 0.31476

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, no super features + Label Encoding:

val-rmse:2.56494e+06
train-rmse:1.78862e+06+18589.1  test-rmse:2.69283e+06+79861.4
predict-train: 1923466.41923
kaggle: 0.31434

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, remove material state num_room:

val-rmse:2.56932e+06
train-rmse:1.88495e+06+20133.7  test-rmse:2.69624e+06+70491.2
predict-train: 1979198.19201
kaggle: 0.31513

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro/bus...:

val-rmse:2.60017e+06
train-rmse:1.80654e+06+19453.5  test-rmse:2.68203e+06+68169.5
predict-train: 1906439.98603
kaggle: 0.31927

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, remove 50 features:

val-rmse:2.93665e+06
train-rmse:1.73425e+06+19462.4  test-rmse:2.68682e+06+140661
predict-train: 1861268.6455
kaggle: 0.31555

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, remove 50 features, add ratio feats:

val-rmse:2.59747e+06
train-rmse:1.75828e+06+26639.4  test-rmse:2.68491e+06+67201.8
predict-train: 1875707.6581
kaggle: 0.31760

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, superfeatures + Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle:

val-rmse:2.5419e+06
train-rmse:1.74381e+06+22710.7  test-rmse:2.65787e+06+66889.9
predict-train: 1862467.67153
kaggle: 0.31716

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle:

val-rmse:2.5676e+06
train-rmse:1.81485e+06+24274    test-rmse:2.67324e+06+60153.1
predict-train: 1947645.83102
kaggle: 0.31376

Feature Greedy selection


In [45]:
from tqdm import tqdm
def get_best_score(train):
    xgb_params = {
        'max_depth': 5,
        'n_estimators': 200,
        'learning_rate': 0.01,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1
    }
    cvres = xgb.cv(xgb_params, train, num_boost_round=4000, early_stopping_rounds=40)
    return cvres["test-rmse-mean"].min(), cvres["test-rmse-mean"].argmin()

def df2DMatrix(df):
    return xgb.DMatrix(data=df.drop("price_doc", axis=1).values, label=df["price_doc"].values)

def greedy_remove_features(df, feature_importances):
    train = df
    with open("greedy_search.tsv", "a") as f:
        best_score, iterno = get_best_score(df2DMatrix(df))
        f.write("\t".join(["INITIAL", str(best_score), str(iterno)]) + "\n")
        to_analyze = sorted(feature_importances.items(), key=lambda x: x[1])
        for feat, feat_importance in tqdm(to_analyze):
            f.flush()
            candidate_train = train.drop(feat, axis=1)
            cand_best_score, iterno = get_best_score(df2DMatrix(candidate_train))

            if cand_best_score > best_score:
                # стало хуже, оставляем фичу
                f.write("\t".join([feat, str(cand_best_score), str(best_score), str(feat_importance), str(iterno), "skip"]) + "\n")
                f.flush()
                continue

            f.write("\t".join([feat, str(cand_best_score), str(best_score), str(feat_importance), str(iterno), "remove"]) + "\n")
            best_score = cand_best_score
            train = candidate_train

In [47]:
feature_importances = imp_features.set_index("feature").to_dict()["importance"]

train_gs = train
with open("greedy_search.tsv") as gs:
    for line in gs:
        row = line.strip().split("\t")
        if len(row) < 6:
            continue
        if row[5] == "remove":
            try:
                train_gs = train_gs.drop(row[0], axis=1)
            except ValueError:
                pass
            print "drop", row[0]
        feature_importances.pop(row[0], None)

greedy_remove_features(train_gs, feature_importances)


drop 0_6_female
drop young_female
drop market_count_500
drop cafe_count_500_price_4000
drop nuclear_reactor_raion_bool
drop work_male
drop radiation_raion_bool
drop 0_13_female
drop detention_facility_raion_bool
drop thermal_power_plant_raion_bool
drop work_female
drop ecology_index
drop 0_17_male
drop railroad_terminal_raion_bool
drop church_count_1500
drop big_road1_1line_bool
drop additional_education_raion
drop cafe_count_1000_price_4000
drop cafe_count_3000_price_1500
drop office_count_5000
drop children_school
drop cafe_avg_price_5000
drop build_count_1946-1970
drop school_education_centers_raion
drop build_count_foam
drop market_count_5000
drop cafe_count_3000_price_1000
drop cafe_sum_5000_max_price_avg
drop cafe_count_500_price_2500
drop cafe_count_500_price_1500
100%|██████████| 123/123 [20:40:45<00:00, 613.14s/it]  

Получаем преобразованные train/test


In [168]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()


Out[168]:
id timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state product_type sub_area area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion children_school school_quota school_education_centers_raion school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion additional_education_raion culture_objects_top_25 culture_objects_top_25_raion shopping_centers_raion office_raion thermal_power_plant_raion incineration_raion oil_chemistry_raion radiation_raion railroad_terminal_raion big_market_raion nuclear_reactor_raion detention_facility_raion full_all male_f female_f young_all young_male young_female work_all work_male work_female ekder_all ekder_male ekder_female 0_6_all 0_6_male 0_6_female 7_14_all 7_14_male 7_14_female 0_17_all 0_17_male 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male 0_13_female raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_foam build_count_slag build_count_mix raion_build_count_with_builddate_info build_count_before_1920 build_count_1921-1945 build_count_1946-1970 build_count_1971-1995 build_count_after_1995 ID_metro metro_min_avto metro_km_avto metro_min_walk metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km water_treatment_km cemetery_km incineration_km railroad_station_walk_km railroad_station_walk_min ID_railroad_station_walk railroad_station_avto_km railroad_station_avto_min ID_railroad_station_avto public_transport_station_km public_transport_station_min_walk water_km water_1line mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km ID_big_road1 big_road1_1line big_road2_km ID_big_road2 railroad_km railroad_1line zd_vokzaly_avto_km ID_railroad_terminal bus_terminal_avto_km ID_bus_terminal oil_chemistry_km nuclear_reactor_km radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km catering_km ecology green_part_500 prom_part_500 office_count_500 office_sqm_500 trc_count_500 trc_sqm_500 cafe_count_500 cafe_sum_500_min_price_avg cafe_sum_500_max_price_avg cafe_avg_price_500 cafe_count_500_na_price cafe_count_500_price_500 cafe_count_500_price_1000 cafe_count_500_price_1500 cafe_count_500_price_2500 cafe_count_500_price_4000 cafe_count_500_price_high big_church_count_500 church_count_500 mosque_count_500 leisure_count_500 sport_count_500 market_count_500 green_part_1000 prom_part_1000 office_count_1000 office_sqm_1000 trc_count_1000 trc_sqm_1000 cafe_count_1000 cafe_sum_1000_min_price_avg cafe_sum_1000_max_price_avg cafe_avg_price_1000 cafe_count_1000_na_price cafe_count_1000_price_500 cafe_count_1000_price_1000 cafe_count_1000_price_1500 cafe_count_1000_price_2500 cafe_count_1000_price_4000 cafe_count_1000_price_high big_church_count_1000 church_count_1000 mosque_count_1000 leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_1000 cafe_count_3000_price_1500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_count_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_sum_5000_max_price_avg cafe_avg_price_5000 cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
0 1 2011-08-20 43 27.0 4.0 NaN NaN NaN NaN NaN NaN Investment Bibirevo 6.407578e+06 155572 0.189727 0.000070 9576 5001.0 5.0 10309 11065.0 5 0 240.0 1 0 7 3 no 0 16 1 no no no no no no no no 86206 40477 45729 21154 11007 10147 98207 52277 45930 36211 10580 25631 9576 4899 4677 10309 5463 4846 23603 12286 11317 17508 9425 8083 18654 9709 8945 211.0 25.0 0.0 0.0 0.0 2.0 184.0 0.0 0.0 0.0 211.0 0.0 0.0 0.0 206.0 5.0 1 2.590241 1.131260 13.575119 1.131260 0.145700 0.177975 2.158587 0.600973 1.080934 23.683460 1.804127 3.633334 5.419893 65.038716 1.0 5.419893 6.905893 1 0.274985 3.299822 0.992631 no 1.422391 10.918587 13.100618 13.675657 15.156211 1.422391 1 no 3.830951 5 1.305159 no 14.231961 101 24.292406 1 18.152338 5.718519 1.210027 1.062513 5.814135 4.308127 10.814172 1.676258 0.485841 3.065047 1.107594 8.148591 3.516513 2.392353 4.248036 0.974743 6.715026 0.884350 0.648488 0.637189 0.947962 0.177975 0.625783 0.628187 3.932040 14.053047 7.389498 7.023705 0.516838 good 0.00 0.00 0 0 0 0 0 NaN NaN NaN 0 0 0 0 0 0 0 0 0 0 0 1 0 7.36 0.00 1 30500 3 55600 19 527.78 888.89 708.33 1 10 4 3 1 0 0 1 2 0 0 6 1 14.27 6.92 3 39554 9 171420 34 566.67 969.70 768.18 1 14 11 6 2 0 0 1 2 0 0 7 1 11.77 15.97 9 188854 19 1244891 36 614.29 1042.86 828.57 1 15 11 6 2 1 0 1 2 0 0 10 1 11.98 13.55 12 251554 23 1419204 68 639.68 1079.37 859.52 5 21 22 16 3 1 0 2 4 0 0 21 1 13.09 13.31 29 807385 52 4036616 152 708.57 1185.71 947.14 12 39 48 40 9 4 0 13 22 1 0 52 4 5850000
1 2 2011-08-23 34 19.0 3.0 NaN NaN NaN NaN NaN NaN Investment Nagatinskij Zaton 9.589337e+06 115352 0.372602 0.049637 6880 3119.0 5.0 7759 6237.0 8 0 229.0 1 0 6 1 yes 1 3 0 no no no no no no no no 76284 34200 42084 15727 7925 7802 70194 35622 34572 29431 9266 20165 6880 3466 3414 7759 3909 3850 17700 8998 8702 15164 7571 7593 13729 6929 6800 245.0 83.0 1.0 0.0 67.0 4.0 90.0 0.0 0.0 0.0 244.0 1.0 1.0 143.0 84.0 15.0 2 0.936700 0.647337 7.620630 0.635053 0.147754 0.273345 0.550690 0.065321 0.966479 1.317476 4.655004 8.648587 3.411993 40.943917 2.0 3.641773 4.679745 2 0.065263 0.783160 0.698081 no 9.503405 3.103996 6.444333 8.132640 8.698054 2.887377 2 no 3.103996 4 0.694536 no 9.242586 32 5.706113 2 9.034642 3.489954 2.724295 1.246149 3.419574 0.725560 6.910568 3.424716 0.668364 2.000154 8.972823 6.127073 1.161579 2.543747 12.649879 1.477723 1.852560 0.686252 0.519311 0.688796 1.072315 0.273345 0.967821 0.471447 4.841544 6.829889 0.709260 2.358840 0.230287 excellent 25.14 0.00 0 0 0 0 5 860.00 1500.00 1180.00 0 1 3 0 0 1 0 0 1 0 0 0 0 26.66 0.07 2 86600 5 94065 13 615.38 1076.92 846.15 0 5 6 1 0 1 0 1 2 0 4 2 0 21.53 7.71 3 102910 7 127065 17 694.12 1205.88 950.00 0 6 7 1 2 1 0 1 5 0 4 9 0 22.37 19.25 4 165510 8 179065 21 695.24 1190.48 942.86 0 7 8 3 2 1 0 1 5 0 4 11 0 18.07 27.32 12 821986 14 491565 30 631.03 1086.21 858.62 1 11 11 4 2 1 0 1 7 0 6 19 1 10.26 27.47 66 2690465 40 2034942 177 673.81 1148.81 911.31 9 49 65 36 15 3 0 15 29 1 10 66 14 6000000
2 3 2011-08-27 43 29.0 2.0 NaN NaN NaN NaN NaN NaN Investment Tekstil'shhiki 4.808270e+06 101708 0.112560 0.118537 5879 1463.0 4.0 6207 5580.0 7 0 1183.0 1 0 5 1 no 0 0 1 no no no yes no no no no 101982 46076 55906 13028 6835 6193 63388 31813 31575 25292 7609 17683 5879 3095 2784 6207 3269 2938 14884 7821 7063 19401 9045 10356 11252 5916 5336 330.0 59.0 0.0 0.0 206.0 4.0 60.0 0.0 1.0 0.0 330.0 1.0 0.0 246.0 63.0 20.0 3 2.120999 1.637996 17.351515 1.445960 0.049102 0.158072 0.374848 0.453172 0.939275 4.912660 3.381083 11.996480 1.277658 15.331896 3.0 1.277658 1.701420 3 0.328756 3.945073 0.468265 no 5.604800 2.927487 6.963403 8.054252 9.067885 0.647250 3 no 2.927487 4 0.700691 no 9.540544 5 6.710302 3 5.777394 7.506612 0.772216 1.602183 3.682455 3.562188 5.752368 1.375443 0.733101 1.239304 1.978517 0.767569 1.952771 0.621357 7.682303 0.097144 0.841254 1.510089 1.486533 1.543049 0.391957 0.158072 3.178751 0.755946 7.922152 4.273200 3.156423 4.958214 0.190462 poor 1.67 0.00 0 0 0 0 3 666.67 1166.67 916.67 0 0 2 1 0 0 0 0 0 0 0 0 0 4.99 0.29 0 0 0 0 9 642.86 1142.86 892.86 2 0 5 2 0 0 0 0 1 0 0 5 3 9.92 6.73 0 0 1 2600 14 516.67 916.67 716.67 2 4 6 2 0 0 0 0 4 0 0 6 5 12.99 12.75 4 100200 7 52550 24 563.64 977.27 770.45 2 8 9 4 1 0 0 0 4 0 0 8 5 12.14 26.46 8 110856 7 52550 41 697.44 1192.31 944.87 2 9 17 9 3 1 0 0 11 0 0 20 6 13.69 21.58 43 1478160 35 1572990 122 702.68 1196.43 949.55 10 29 45 25 10 3 0 11 27 0 4 67 10 5700000
3 4 2011-09-01 89 50.0 9.0 NaN NaN NaN NaN NaN NaN Investment Mitino 1.258354e+07 178473 0.194703 0.069753 13087 6839.0 9.0 13670 17063.0 10 0 NaN 1 0 17 6 no 0 11 4 no no no no no no no no 21155 9828 11327 28563 14680 13883 120381 60040 60341 29529 9083 20446 13087 6645 6442 13670 7126 6544 32063 16513 15550 3292 1450 1842 24934 12782 12152 458.0 9.0 51.0 12.0 124.0 50.0 201.0 0.0 9.0 2.0 459.0 13.0 24.0 40.0 130.0 252.0 4 1.489049 0.984537 11.565624 0.963802 0.179441 0.236455 0.078090 0.106125 0.451173 15.623710 2.017080 14.317640 4.291432 51.497190 4.0 3.816045 5.271136 4 0.131597 1.579164 1.200336 no 2.677824 14.606501 17.457198 18.309433 19.487005 2.677824 1 no 2.780449 17 1.999265 no 17.478380 83 6.734618 1 27.667863 9.522538 6.348716 1.767612 11.178333 0.583025 27.892717 0.811275 0.623484 1.950317 6.483172 7.385521 4.923843 3.549558 8.789894 2.163735 10.903161 0.622272 0.599914 0.934273 0.892674 0.236455 1.031777 1.561505 15.300449 16.990677 16.041521 5.029696 0.465820 good 17.36 0.57 0 0 0 0 2 1000.00 1500.00 1250.00 0 0 0 2 0 0 0 0 0 0 0 0 0 19.25 10.35 1 11000 6 80780 12 658.33 1083.33 870.83 0 3 4 5 0 0 0 0 0 0 0 3 1 28.38 6.57 2 11000 7 89492 23 673.91 1130.43 902.17 0 5 9 8 1 0 0 1 0 0 0 9 2 32.29 5.73 2 11000 7 89492 25 660.00 1120.00 890.00 0 5 11 8 1 0 0 1 1 0 0 13 2 20.79 3.57 4 167000 12 205756 32 718.75 1218.75 968.75 0 5 14 10 3 0 0 1 2 0 0 18 3 14.18 3.89 8 244166 22 942180 61 931.58 1552.63 1242.11 4 7 21 15 11 2 1 4 4 0 0 26 3 13100000
4 5 2011-09-05 77 77.0 4.0 NaN NaN NaN NaN NaN NaN Investment Basmannoe 8.398461e+06 108171 0.015234 0.037316 5706 3240.0 7.0 6748 7770.0 9 0 562.0 4 2 25 2 no 0 10 93 no no no yes yes no no no 28179 13522 14657 13368 7159 6209 68043 34236 33807 26760 8563 18197 5706 2982 2724 6748 3664 3084 15237 8113 7124 5164 2583 2581 11631 6223 5408 746.0 48.0 0.0 0.0 643.0 16.0 35.0 0.0 3.0 1.0 746.0 371.0 114.0 146.0 62.0 53.0 5 1.257186 0.876620 8.266305 0.688859 0.247901 0.376838 0.258289 0.236214 0.392871 10.683540 2.936581 11.903910 0.853960 10.247521 5.0 1.595898 2.156284 113 0.071480 0.857764 0.820294 no 11.616653 1.721834 0.046810 0.787593 2.578671 1.721834 4 no 3.133531 10 0.084113 yes 1.595898 113 1.423428 4 6.515857 8.671016 1.638318 3.632640 4.587917 2.609420 9.155057 1.969738 0.220288 2.544696 3.975401 3.610754 0.307915 1.864637 3.779781 1.121703 0.991683 0.892668 0.429052 0.077901 0.810801 0.376838 0.378756 0.121681 2.584370 1.112486 1.800125 1.339652 0.026102 excellent 3.56 4.44 15 293699 1 45000 48 702.22 1166.67 934.44 3 17 10 11 7 0 0 1 4 0 2 3 0 3.34 8.29 46 420952 3 158200 153 763.45 1272.41 1017.93 8 39 45 39 19 2 1 7 12 0 6 7 0 4.12 4.83 93 1195735 9 445900 272 766.80 1272.73 1019.76 19 70 74 72 30 6 1 18 30 0 10 14 2 4.53 5.02 149 1625130 17 564843 483 765.93 1269.23 1017.58 28 130 129 131 50 14 1 35 61 0 17 21 3 5.06 8.62 305 3420907 60 2296870 1068 853.03 1410.45 1131.74 63 266 267 262 149 57 4 70 121 1 40 77 5 8.38 10.92 689 8404624 114 3503058 2283 853.88 1411.45 1132.66 143 566 578 552 319 108 17 135 236 2 91 195 14 16331452

In [169]:
train_new_pr = feature_exclude(preprocess_categorial(preprocess(train_raw, dropid=False)))
test_new_pr = feature_exclude(preprocess_categorial(preprocess(test, dropid=False)))

# нужно сделать fillna, чтобы получить филлеры для NA из моделей
filled_train = fill_na_xgb(train_new_pr)
filled_test = fill_na_xgb(test_new_pr)

filled_train = filled_train.set_index("id")
filled_test = filled_test.set_index("id")

In [175]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()


Out[175]:
id timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state product_type sub_area area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion children_school school_quota school_education_centers_raion school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion additional_education_raion culture_objects_top_25 culture_objects_top_25_raion shopping_centers_raion office_raion thermal_power_plant_raion incineration_raion oil_chemistry_raion radiation_raion railroad_terminal_raion big_market_raion nuclear_reactor_raion detention_facility_raion full_all male_f female_f young_all young_male young_female work_all work_male work_female ekder_all ekder_male ekder_female 0_6_all 0_6_male 0_6_female 7_14_all 7_14_male 7_14_female 0_17_all 0_17_male 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male 0_13_female raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_foam build_count_slag build_count_mix raion_build_count_with_builddate_info build_count_before_1920 build_count_1921-1945 build_count_1946-1970 build_count_1971-1995 build_count_after_1995 ID_metro metro_min_avto metro_km_avto metro_min_walk metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km water_treatment_km cemetery_km incineration_km railroad_station_walk_km railroad_station_walk_min ID_railroad_station_walk railroad_station_avto_km railroad_station_avto_min ID_railroad_station_avto public_transport_station_km public_transport_station_min_walk water_km water_1line mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km ID_big_road1 big_road1_1line big_road2_km ID_big_road2 railroad_km railroad_1line zd_vokzaly_avto_km ID_railroad_terminal bus_terminal_avto_km ID_bus_terminal oil_chemistry_km nuclear_reactor_km radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km catering_km ecology green_part_500 prom_part_500 office_count_500 office_sqm_500 trc_count_500 trc_sqm_500 cafe_count_500 cafe_sum_500_min_price_avg cafe_sum_500_max_price_avg cafe_avg_price_500 cafe_count_500_na_price cafe_count_500_price_500 cafe_count_500_price_1000 cafe_count_500_price_1500 cafe_count_500_price_2500 cafe_count_500_price_4000 cafe_count_500_price_high big_church_count_500 church_count_500 mosque_count_500 leisure_count_500 sport_count_500 market_count_500 green_part_1000 prom_part_1000 office_count_1000 office_sqm_1000 trc_count_1000 trc_sqm_1000 cafe_count_1000 cafe_sum_1000_min_price_avg cafe_sum_1000_max_price_avg cafe_avg_price_1000 cafe_count_1000_na_price cafe_count_1000_price_500 cafe_count_1000_price_1000 cafe_count_1000_price_1500 cafe_count_1000_price_2500 cafe_count_1000_price_4000 cafe_count_1000_price_high big_church_count_1000 church_count_1000 mosque_count_1000 leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_1000 cafe_count_3000_price_1500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_count_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_sum_5000_max_price_avg cafe_avg_price_5000 cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
0 1 2011-08-20 43 27.0 4.0 NaN NaN NaN NaN NaN NaN Investment Bibirevo 6.407578e+06 155572 0.189727 0.000070 9576 5001.0 5.0 10309 11065.0 5 0 240.0 1 0 7 3 no 0 16 1 no no no no no no no no 86206 40477 45729 21154 11007 10147 98207 52277 45930 36211 10580 25631 9576 4899 4677 10309 5463 4846 23603 12286 11317 17508 9425 8083 18654 9709 8945 211.0 25.0 0.0 0.0 0.0 2.0 184.0 0.0 0.0 0.0 211.0 0.0 0.0 0.0 206.0 5.0 1 2.590241 1.131260 13.575119 1.131260 0.145700 0.177975 2.158587 0.600973 1.080934 23.683460 1.804127 3.633334 5.419893 65.038716 1.0 5.419893 6.905893 1 0.274985 3.299822 0.992631 no 1.422391 10.918587 13.100618 13.675657 15.156211 1.422391 1 no 3.830951 5 1.305159 no 14.231961 101 24.292406 1 18.152338 5.718519 1.210027 1.062513 5.814135 4.308127 10.814172 1.676258 0.485841 3.065047 1.107594 8.148591 3.516513 2.392353 4.248036 0.974743 6.715026 0.884350 0.648488 0.637189 0.947962 0.177975 0.625783 0.628187 3.932040 14.053047 7.389498 7.023705 0.516838 good 0.00 0.00 0 0 0 0 0 NaN NaN NaN 0 0 0 0 0 0 0 0 0 0 0 1 0 7.36 0.00 1 30500 3 55600 19 527.78 888.89 708.33 1 10 4 3 1 0 0 1 2 0 0 6 1 14.27 6.92 3 39554 9 171420 34 566.67 969.70 768.18 1 14 11 6 2 0 0 1 2 0 0 7 1 11.77 15.97 9 188854 19 1244891 36 614.29 1042.86 828.57 1 15 11 6 2 1 0 1 2 0 0 10 1 11.98 13.55 12 251554 23 1419204 68 639.68 1079.37 859.52 5 21 22 16 3 1 0 2 4 0 0 21 1 13.09 13.31 29 807385 52 4036616 152 708.57 1185.71 947.14 12 39 48 40 9 4 0 13 22 1 0 52 4 5850000
1 2 2011-08-23 34 19.0 3.0 NaN NaN NaN NaN NaN NaN Investment Nagatinskij Zaton 9.589337e+06 115352 0.372602 0.049637 6880 3119.0 5.0 7759 6237.0 8 0 229.0 1 0 6 1 yes 1 3 0 no no no no no no no no 76284 34200 42084 15727 7925 7802 70194 35622 34572 29431 9266 20165 6880 3466 3414 7759 3909 3850 17700 8998 8702 15164 7571 7593 13729 6929 6800 245.0 83.0 1.0 0.0 67.0 4.0 90.0 0.0 0.0 0.0 244.0 1.0 1.0 143.0 84.0 15.0 2 0.936700 0.647337 7.620630 0.635053 0.147754 0.273345 0.550690 0.065321 0.966479 1.317476 4.655004 8.648587 3.411993 40.943917 2.0 3.641773 4.679745 2 0.065263 0.783160 0.698081 no 9.503405 3.103996 6.444333 8.132640 8.698054 2.887377 2 no 3.103996 4 0.694536 no 9.242586 32 5.706113 2 9.034642 3.489954 2.724295 1.246149 3.419574 0.725560 6.910568 3.424716 0.668364 2.000154 8.972823 6.127073 1.161579 2.543747 12.649879 1.477723 1.852560 0.686252 0.519311 0.688796 1.072315 0.273345 0.967821 0.471447 4.841544 6.829889 0.709260 2.358840 0.230287 excellent 25.14 0.00 0 0 0 0 5 860.00 1500.00 1180.00 0 1 3 0 0 1 0 0 1 0 0 0 0 26.66 0.07 2 86600 5 94065 13 615.38 1076.92 846.15 0 5 6 1 0 1 0 1 2 0 4 2 0 21.53 7.71 3 102910 7 127065 17 694.12 1205.88 950.00 0 6 7 1 2 1 0 1 5 0 4 9 0 22.37 19.25 4 165510 8 179065 21 695.24 1190.48 942.86 0 7 8 3 2 1 0 1 5 0 4 11 0 18.07 27.32 12 821986 14 491565 30 631.03 1086.21 858.62 1 11 11 4 2 1 0 1 7 0 6 19 1 10.26 27.47 66 2690465 40 2034942 177 673.81 1148.81 911.31 9 49 65 36 15 3 0 15 29 1 10 66 14 6000000
2 3 2011-08-27 43 29.0 2.0 NaN NaN NaN NaN NaN NaN Investment Tekstil'shhiki 4.808270e+06 101708 0.112560 0.118537 5879 1463.0 4.0 6207 5580.0 7 0 1183.0 1 0 5 1 no 0 0 1 no no no yes no no no no 101982 46076 55906 13028 6835 6193 63388 31813 31575 25292 7609 17683 5879 3095 2784 6207 3269 2938 14884 7821 7063 19401 9045 10356 11252 5916 5336 330.0 59.0 0.0 0.0 206.0 4.0 60.0 0.0 1.0 0.0 330.0 1.0 0.0 246.0 63.0 20.0 3 2.120999 1.637996 17.351515 1.445960 0.049102 0.158072 0.374848 0.453172 0.939275 4.912660 3.381083 11.996480 1.277658 15.331896 3.0 1.277658 1.701420 3 0.328756 3.945073 0.468265 no 5.604800 2.927487 6.963403 8.054252 9.067885 0.647250 3 no 2.927487 4 0.700691 no 9.540544 5 6.710302 3 5.777394 7.506612 0.772216 1.602183 3.682455 3.562188 5.752368 1.375443 0.733101 1.239304 1.978517 0.767569 1.952771 0.621357 7.682303 0.097144 0.841254 1.510089 1.486533 1.543049 0.391957 0.158072 3.178751 0.755946 7.922152 4.273200 3.156423 4.958214 0.190462 poor 1.67 0.00 0 0 0 0 3 666.67 1166.67 916.67 0 0 2 1 0 0 0 0 0 0 0 0 0 4.99 0.29 0 0 0 0 9 642.86 1142.86 892.86 2 0 5 2 0 0 0 0 1 0 0 5 3 9.92 6.73 0 0 1 2600 14 516.67 916.67 716.67 2 4 6 2 0 0 0 0 4 0 0 6 5 12.99 12.75 4 100200 7 52550 24 563.64 977.27 770.45 2 8 9 4 1 0 0 0 4 0 0 8 5 12.14 26.46 8 110856 7 52550 41 697.44 1192.31 944.87 2 9 17 9 3 1 0 0 11 0 0 20 6 13.69 21.58 43 1478160 35 1572990 122 702.68 1196.43 949.55 10 29 45 25 10 3 0 11 27 0 4 67 10 5700000
3 4 2011-09-01 89 50.0 9.0 NaN NaN NaN NaN NaN NaN Investment Mitino 1.258354e+07 178473 0.194703 0.069753 13087 6839.0 9.0 13670 17063.0 10 0 NaN 1 0 17 6 no 0 11 4 no no no no no no no no 21155 9828 11327 28563 14680 13883 120381 60040 60341 29529 9083 20446 13087 6645 6442 13670 7126 6544 32063 16513 15550 3292 1450 1842 24934 12782 12152 458.0 9.0 51.0 12.0 124.0 50.0 201.0 0.0 9.0 2.0 459.0 13.0 24.0 40.0 130.0 252.0 4 1.489049 0.984537 11.565624 0.963802 0.179441 0.236455 0.078090 0.106125 0.451173 15.623710 2.017080 14.317640 4.291432 51.497190 4.0 3.816045 5.271136 4 0.131597 1.579164 1.200336 no 2.677824 14.606501 17.457198 18.309433 19.487005 2.677824 1 no 2.780449 17 1.999265 no 17.478380 83 6.734618 1 27.667863 9.522538 6.348716 1.767612 11.178333 0.583025 27.892717 0.811275 0.623484 1.950317 6.483172 7.385521 4.923843 3.549558 8.789894 2.163735 10.903161 0.622272 0.599914 0.934273 0.892674 0.236455 1.031777 1.561505 15.300449 16.990677 16.041521 5.029696 0.465820 good 17.36 0.57 0 0 0 0 2 1000.00 1500.00 1250.00 0 0 0 2 0 0 0 0 0 0 0 0 0 19.25 10.35 1 11000 6 80780 12 658.33 1083.33 870.83 0 3 4 5 0 0 0 0 0 0 0 3 1 28.38 6.57 2 11000 7 89492 23 673.91 1130.43 902.17 0 5 9 8 1 0 0 1 0 0 0 9 2 32.29 5.73 2 11000 7 89492 25 660.00 1120.00 890.00 0 5 11 8 1 0 0 1 1 0 0 13 2 20.79 3.57 4 167000 12 205756 32 718.75 1218.75 968.75 0 5 14 10 3 0 0 1 2 0 0 18 3 14.18 3.89 8 244166 22 942180 61 931.58 1552.63 1242.11 4 7 21 15 11 2 1 4 4 0 0 26 3 13100000
4 5 2011-09-05 77 77.0 4.0 NaN NaN NaN NaN NaN NaN Investment Basmannoe 8.398461e+06 108171 0.015234 0.037316 5706 3240.0 7.0 6748 7770.0 9 0 562.0 4 2 25 2 no 0 10 93 no no no yes yes no no no 28179 13522 14657 13368 7159 6209 68043 34236 33807 26760 8563 18197 5706 2982 2724 6748 3664 3084 15237 8113 7124 5164 2583 2581 11631 6223 5408 746.0 48.0 0.0 0.0 643.0 16.0 35.0 0.0 3.0 1.0 746.0 371.0 114.0 146.0 62.0 53.0 5 1.257186 0.876620 8.266305 0.688859 0.247901 0.376838 0.258289 0.236214 0.392871 10.683540 2.936581 11.903910 0.853960 10.247521 5.0 1.595898 2.156284 113 0.071480 0.857764 0.820294 no 11.616653 1.721834 0.046810 0.787593 2.578671 1.721834 4 no 3.133531 10 0.084113 yes 1.595898 113 1.423428 4 6.515857 8.671016 1.638318 3.632640 4.587917 2.609420 9.155057 1.969738 0.220288 2.544696 3.975401 3.610754 0.307915 1.864637 3.779781 1.121703 0.991683 0.892668 0.429052 0.077901 0.810801 0.376838 0.378756 0.121681 2.584370 1.112486 1.800125 1.339652 0.026102 excellent 3.56 4.44 15 293699 1 45000 48 702.22 1166.67 934.44 3 17 10 11 7 0 0 1 4 0 2 3 0 3.34 8.29 46 420952 3 158200 153 763.45 1272.41 1017.93 8 39 45 39 19 2 1 7 12 0 6 7 0 4.12 4.83 93 1195735 9 445900 272 766.80 1272.73 1019.76 19 70 74 72 30 6 1 18 30 0 10 14 2 4.53 5.02 149 1625130 17 564843 483 765.93 1269.23 1017.58 28 130 129 131 50 14 1 35 61 0 17 21 3 5.06 8.62 305 3420907 60 2296870 1068 853.03 1410.45 1131.74 63 266 267 262 149 57 4 70 121 1 40 77 5 8.38 10.92 689 8404624 114 3503058 2283 853.88 1411.45 1132.66 143 566 578 552 319 108 17 135 236 2 91 195 14 16331452

In [176]:
train_new = preprocess_anomaly(train_raw)
test_new = preprocess_anomaly(test)

train_new = train_new.set_index("id")
test_new = test_new.set_index("id")

train_new = train_new.join(filled_train[important_feats], rsuffix="_filled")
test_new = test_new.join(filled_test[important_feats], rsuffix="_filled")
for impf in important_feats:
    train_new[impf] = train_new[impf].fillna(train_new["%s_filled" % impf])
    train_new = train_new.drop(["%s_filled" % impf], axis=1)
    test_new[impf] = test_new[impf].fillna(test_new["%s_filled" % impf])
    test_new = test_new.drop(["%s_filled" % impf], axis=1)

In [177]:
# train_new = feature_exclude(train_new)
# test_new = feature_exclude(test_new)

In [178]:
train_new.to_csv("data/train_cleaned.csv", encoding="utf_8")
test_new.to_csv("data/test_cleaned.csv", encoding="utf_8")

Fix from Sberbank


In [259]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()


Out[259]:
id timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state product_type sub_area area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion children_school school_quota school_education_centers_raion school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion additional_education_raion culture_objects_top_25 culture_objects_top_25_raion shopping_centers_raion office_raion thermal_power_plant_raion incineration_raion oil_chemistry_raion radiation_raion railroad_terminal_raion big_market_raion nuclear_reactor_raion detention_facility_raion full_all male_f female_f young_all young_male young_female work_all work_male work_female ekder_all ekder_male ekder_female 0_6_all 0_6_male 0_6_female 7_14_all 7_14_male 7_14_female 0_17_all 0_17_male 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male 0_13_female raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_foam build_count_slag build_count_mix raion_build_count_with_builddate_info build_count_before_1920 build_count_1921-1945 build_count_1946-1970 build_count_1971-1995 build_count_after_1995 ID_metro metro_min_avto metro_km_avto metro_min_walk metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km water_treatment_km cemetery_km incineration_km railroad_station_walk_km railroad_station_walk_min ID_railroad_station_walk railroad_station_avto_km railroad_station_avto_min ID_railroad_station_avto public_transport_station_km public_transport_station_min_walk water_km water_1line mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km ID_big_road1 big_road1_1line big_road2_km ID_big_road2 railroad_km railroad_1line zd_vokzaly_avto_km ID_railroad_terminal bus_terminal_avto_km ID_bus_terminal oil_chemistry_km nuclear_reactor_km radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km catering_km ecology green_part_500 prom_part_500 office_count_500 office_sqm_500 trc_count_500 trc_sqm_500 cafe_count_500 cafe_sum_500_min_price_avg cafe_sum_500_max_price_avg cafe_avg_price_500 cafe_count_500_na_price cafe_count_500_price_500 cafe_count_500_price_1000 cafe_count_500_price_1500 cafe_count_500_price_2500 cafe_count_500_price_4000 cafe_count_500_price_high big_church_count_500 church_count_500 mosque_count_500 leisure_count_500 sport_count_500 market_count_500 green_part_1000 prom_part_1000 office_count_1000 office_sqm_1000 trc_count_1000 trc_sqm_1000 cafe_count_1000 cafe_sum_1000_min_price_avg cafe_sum_1000_max_price_avg cafe_avg_price_1000 cafe_count_1000_na_price cafe_count_1000_price_500 cafe_count_1000_price_1000 cafe_count_1000_price_1500 cafe_count_1000_price_2500 cafe_count_1000_price_4000 cafe_count_1000_price_high big_church_count_1000 church_count_1000 mosque_count_1000 leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_1000 cafe_count_3000_price_1500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_count_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_sum_5000_max_price_avg cafe_avg_price_5000 cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
0 1 2011-08-20 43 27.0 4.0 NaN NaN NaN NaN NaN NaN Investment Bibirevo 6.407578e+06 155572 0.189727 0.000070 9576 5001.0 5.0 10309 11065.0 5 0 240.0 1 0 7 3 no 0 16 1 no no no no no no no no 86206 40477 45729 21154 11007 10147 98207 52277 45930 36211 10580 25631 9576 4899 4677 10309 5463 4846 23603 12286 11317 17508 9425 8083 18654 9709 8945 211.0 25.0 0.0 0.0 0.0 2.0 184.0 0.0 0.0 0.0 211.0 0.0 0.0 0.0 206.0 5.0 1 2.590241 1.131260 13.575119 1.131260 0.145700 0.177975 2.158587 0.600973 1.080934 23.683460 1.804127 3.633334 5.419893 65.038716 1.0 5.419893 6.905893 1 0.274985 3.299822 0.992631 no 1.422391 10.918587 13.100618 13.675657 15.156211 1.422391 1 no 3.830951 5 1.305159 no 14.231961 101 24.292406 1 18.152338 5.718519 1.210027 1.062513 5.814135 4.308127 10.814172 1.676258 0.485841 3.065047 1.107594 8.148591 3.516513 2.392353 4.248036 0.974743 6.715026 0.884350 0.648488 0.637189 0.947962 0.177975 0.625783 0.628187 3.932040 14.053047 7.389498 7.023705 0.516838 good 0.00 0.00 0 0 0 0 0 NaN NaN NaN 0 0 0 0 0 0 0 0 0 0 0 1 0 7.36 0.00 1 30500 3 55600 19 527.78 888.89 708.33 1 10 4 3 1 0 0 1 2 0 0 6 1 14.27 6.92 3 39554 9 171420 34 566.67 969.70 768.18 1 14 11 6 2 0 0 1 2 0 0 7 1 11.77 15.97 9 188854 19 1244891 36 614.29 1042.86 828.57 1 15 11 6 2 1 0 1 2 0 0 10 1 11.98 13.55 12 251554 23 1419204 68 639.68 1079.37 859.52 5 21 22 16 3 1 0 2 4 0 0 21 1 13.09 13.31 29 807385 52 4036616 152 708.57 1185.71 947.14 12 39 48 40 9 4 0 13 22 1 0 52 4 5850000
1 2 2011-08-23 34 19.0 3.0 NaN NaN NaN NaN NaN NaN Investment Nagatinskij Zaton 9.589337e+06 115352 0.372602 0.049637 6880 3119.0 5.0 7759 6237.0 8 0 229.0 1 0 6 1 yes 1 3 0 no no no no no no no no 76284 34200 42084 15727 7925 7802 70194 35622 34572 29431 9266 20165 6880 3466 3414 7759 3909 3850 17700 8998 8702 15164 7571 7593 13729 6929 6800 245.0 83.0 1.0 0.0 67.0 4.0 90.0 0.0 0.0 0.0 244.0 1.0 1.0 143.0 84.0 15.0 2 0.936700 0.647337 7.620630 0.635053 0.147754 0.273345 0.550690 0.065321 0.966479 1.317476 4.655004 8.648587 3.411993 40.943917 2.0 3.641773 4.679745 2 0.065263 0.783160 0.698081 no 9.503405 3.103996 6.444333 8.132640 8.698054 2.887377 2 no 3.103996 4 0.694536 no 9.242586 32 5.706113 2 9.034642 3.489954 2.724295 1.246149 3.419574 0.725560 6.910568 3.424716 0.668364 2.000154 8.972823 6.127073 1.161579 2.543747 12.649879 1.477723 1.852560 0.686252 0.519311 0.688796 1.072315 0.273345 0.967821 0.471447 4.841544 6.829889 0.709260 2.358840 0.230287 excellent 25.14 0.00 0 0 0 0 5 860.00 1500.00 1180.00 0 1 3 0 0 1 0 0 1 0 0 0 0 26.66 0.07 2 86600 5 94065 13 615.38 1076.92 846.15 0 5 6 1 0 1 0 1 2 0 4 2 0 21.53 7.71 3 102910 7 127065 17 694.12 1205.88 950.00 0 6 7 1 2 1 0 1 5 0 4 9 0 22.37 19.25 4 165510 8 179065 21 695.24 1190.48 942.86 0 7 8 3 2 1 0 1 5 0 4 11 0 18.07 27.32 12 821986 14 491565 30 631.03 1086.21 858.62 1 11 11 4 2 1 0 1 7 0 6 19 1 10.26 27.47 66 2690465 40 2034942 177 673.81 1148.81 911.31 9 49 65 36 15 3 0 15 29 1 10 66 14 6000000
2 3 2011-08-27 43 29.0 2.0 NaN NaN NaN NaN NaN NaN Investment Tekstil'shhiki 4.808270e+06 101708 0.112560 0.118537 5879 1463.0 4.0 6207 5580.0 7 0 1183.0 1 0 5 1 no 0 0 1 no no no yes no no no no 101982 46076 55906 13028 6835 6193 63388 31813 31575 25292 7609 17683 5879 3095 2784 6207 3269 2938 14884 7821 7063 19401 9045 10356 11252 5916 5336 330.0 59.0 0.0 0.0 206.0 4.0 60.0 0.0 1.0 0.0 330.0 1.0 0.0 246.0 63.0 20.0 3 2.120999 1.637996 17.351515 1.445960 0.049102 0.158072 0.374848 0.453172 0.939275 4.912660 3.381083 11.996480 1.277658 15.331896 3.0 1.277658 1.701420 3 0.328756 3.945073 0.468265 no 5.604800 2.927487 6.963403 8.054252 9.067885 0.647250 3 no 2.927487 4 0.700691 no 9.540544 5 6.710302 3 5.777394 7.506612 0.772216 1.602183 3.682455 3.562188 5.752368 1.375443 0.733101 1.239304 1.978517 0.767569 1.952771 0.621357 7.682303 0.097144 0.841254 1.510089 1.486533 1.543049 0.391957 0.158072 3.178751 0.755946 7.922152 4.273200 3.156423 4.958214 0.190462 poor 1.67 0.00 0 0 0 0 3 666.67 1166.67 916.67 0 0 2 1 0 0 0 0 0 0 0 0 0 4.99 0.29 0 0 0 0 9 642.86 1142.86 892.86 2 0 5 2 0 0 0 0 1 0 0 5 3 9.92 6.73 0 0 1 2600 14 516.67 916.67 716.67 2 4 6 2 0 0 0 0 4 0 0 6 5 12.99 12.75 4 100200 7 52550 24 563.64 977.27 770.45 2 8 9 4 1 0 0 0 4 0 0 8 5 12.14 26.46 8 110856 7 52550 41 697.44 1192.31 944.87 2 9 17 9 3 1 0 0 11 0 0 20 6 13.69 21.58 43 1478160 35 1572990 122 702.68 1196.43 949.55 10 29 45 25 10 3 0 11 27 0 4 67 10 5700000
3 4 2011-09-01 89 50.0 9.0 NaN NaN NaN NaN NaN NaN Investment Mitino 1.258354e+07 178473 0.194703 0.069753 13087 6839.0 9.0 13670 17063.0 10 0 NaN 1 0 17 6 no 0 11 4 no no no no no no no no 21155 9828 11327 28563 14680 13883 120381 60040 60341 29529 9083 20446 13087 6645 6442 13670 7126 6544 32063 16513 15550 3292 1450 1842 24934 12782 12152 458.0 9.0 51.0 12.0 124.0 50.0 201.0 0.0 9.0 2.0 459.0 13.0 24.0 40.0 130.0 252.0 4 1.489049 0.984537 11.565624 0.963802 0.179441 0.236455 0.078090 0.106125 0.451173 15.623710 2.017080 14.317640 4.291432 51.497190 4.0 3.816045 5.271136 4 0.131597 1.579164 1.200336 no 2.677824 14.606501 17.457198 18.309433 19.487005 2.677824 1 no 2.780449 17 1.999265 no 17.478380 83 6.734618 1 27.667863 9.522538 6.348716 1.767612 11.178333 0.583025 27.892717 0.811275 0.623484 1.950317 6.483172 7.385521 4.923843 3.549558 8.789894 2.163735 10.903161 0.622272 0.599914 0.934273 0.892674 0.236455 1.031777 1.561505 15.300449 16.990677 16.041521 5.029696 0.465820 good 17.36 0.57 0 0 0 0 2 1000.00 1500.00 1250.00 0 0 0 2 0 0 0 0 0 0 0 0 0 19.25 10.35 1 11000 6 80780 12 658.33 1083.33 870.83 0 3 4 5 0 0 0 0 0 0 0 3 1 28.38 6.57 2 11000 7 89492 23 673.91 1130.43 902.17 0 5 9 8 1 0 0 1 0 0 0 9 2 32.29 5.73 2 11000 7 89492 25 660.00 1120.00 890.00 0 5 11 8 1 0 0 1 1 0 0 13 2 20.79 3.57 4 167000 12 205756 32 718.75 1218.75 968.75 0 5 14 10 3 0 0 1 2 0 0 18 3 14.18 3.89 8 244166 22 942180 61 931.58 1552.63 1242.11 4 7 21 15 11 2 1 4 4 0 0 26 3 13100000
4 5 2011-09-05 77 77.0 4.0 NaN NaN NaN NaN NaN NaN Investment Basmannoe 8.398461e+06 108171 0.015234 0.037316 5706 3240.0 7.0 6748 7770.0 9 0 562.0 4 2 25 2 no 0 10 93 no no no yes yes no no no 28179 13522 14657 13368 7159 6209 68043 34236 33807 26760 8563 18197 5706 2982 2724 6748 3664 3084 15237 8113 7124 5164 2583 2581 11631 6223 5408 746.0 48.0 0.0 0.0 643.0 16.0 35.0 0.0 3.0 1.0 746.0 371.0 114.0 146.0 62.0 53.0 5 1.257186 0.876620 8.266305 0.688859 0.247901 0.376838 0.258289 0.236214 0.392871 10.683540 2.936581 11.903910 0.853960 10.247521 5.0 1.595898 2.156284 113 0.071480 0.857764 0.820294 no 11.616653 1.721834 0.046810 0.787593 2.578671 1.721834 4 no 3.133531 10 0.084113 yes 1.595898 113 1.423428 4 6.515857 8.671016 1.638318 3.632640 4.587917 2.609420 9.155057 1.969738 0.220288 2.544696 3.975401 3.610754 0.307915 1.864637 3.779781 1.121703 0.991683 0.892668 0.429052 0.077901 0.810801 0.376838 0.378756 0.121681 2.584370 1.112486 1.800125 1.339652 0.026102 excellent 3.56 4.44 15 293699 1 45000 48 702.22 1166.67 934.44 3 17 10 11 7 0 0 1 4 0 2 3 0 3.34 8.29 46 420952 3 158200 153 763.45 1272.41 1017.93 8 39 45 39 19 2 1 7 12 0 6 7 0 4.12 4.83 93 1195735 9 445900 272 766.80 1272.73 1019.76 19 70 74 72 30 6 1 18 30 0 10 14 2 4.53 5.02 149 1625130 17 564843 483 765.93 1269.23 1017.58 28 130 129 131 50 14 1 35 61 0 17 21 3 5.06 8.62 305 3420907 60 2296870 1068 853.03 1410.45 1131.74 63 266 267 262 149 57 4 70 121 1 40 77 5 8.38 10.92 689 8404624 114 3503058 2283 853.88 1411.45 1132.66 143 566 578 552 319 108 17 135 236 2 91 195 14 16331452

In [260]:
def update(source, patch):
    dtypes = source.dtypes
    source.update(patch, overwrite=True)
    for c, t in dtypes.iteritems():
        source[c] = source[c].astype(t)
    return source

In [261]:
train_raw.set_index("id")
test.set_index("id")
fx = pd.read_excel('data/BAD_ADDRESS_FIX.xlsx').drop_duplicates('id').set_index('id')

train_raw = update(train_raw, fx)
test = update(test, fx)

train_raw.reset_index()
test.reset_index()
print('Fix in train: ', train_raw.index.intersection(fx.index).shape[0])
print('Fix in test : ', test.index.intersection(fx.index).shape[0])

train_raw.to_csv("data/train_fix.csv", index=False, encoding="utf-8")
test.to_csv("data/test_fix.csv", index=False, encoding="utf-8")


('Fix in train: ', 500)
('Fix in test : ', 115)

Auto ML


In [266]:
from auto_ml import Predictor

In [267]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()


Out[267]:
id timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state product_type sub_area area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion children_school school_quota school_education_centers_raion school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion additional_education_raion culture_objects_top_25 culture_objects_top_25_raion shopping_centers_raion office_raion thermal_power_plant_raion incineration_raion oil_chemistry_raion radiation_raion railroad_terminal_raion big_market_raion nuclear_reactor_raion detention_facility_raion full_all male_f female_f young_all young_male young_female work_all work_male work_female ekder_all ekder_male ekder_female 0_6_all 0_6_male 0_6_female 7_14_all 7_14_male 7_14_female 0_17_all 0_17_male 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male 0_13_female raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_foam build_count_slag build_count_mix raion_build_count_with_builddate_info build_count_before_1920 build_count_1921-1945 build_count_1946-1970 build_count_1971-1995 build_count_after_1995 ID_metro metro_min_avto metro_km_avto metro_min_walk metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km water_treatment_km cemetery_km incineration_km railroad_station_walk_km railroad_station_walk_min ID_railroad_station_walk railroad_station_avto_km railroad_station_avto_min ID_railroad_station_avto public_transport_station_km public_transport_station_min_walk water_km water_1line mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km ID_big_road1 big_road1_1line big_road2_km ID_big_road2 railroad_km railroad_1line zd_vokzaly_avto_km ID_railroad_terminal bus_terminal_avto_km ID_bus_terminal oil_chemistry_km nuclear_reactor_km radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km catering_km ecology green_part_500 prom_part_500 office_count_500 office_sqm_500 trc_count_500 trc_sqm_500 cafe_count_500 cafe_sum_500_min_price_avg cafe_sum_500_max_price_avg cafe_avg_price_500 cafe_count_500_na_price cafe_count_500_price_500 cafe_count_500_price_1000 cafe_count_500_price_1500 cafe_count_500_price_2500 cafe_count_500_price_4000 cafe_count_500_price_high big_church_count_500 church_count_500 mosque_count_500 leisure_count_500 sport_count_500 market_count_500 green_part_1000 prom_part_1000 office_count_1000 office_sqm_1000 trc_count_1000 trc_sqm_1000 cafe_count_1000 cafe_sum_1000_min_price_avg cafe_sum_1000_max_price_avg cafe_avg_price_1000 cafe_count_1000_na_price cafe_count_1000_price_500 cafe_count_1000_price_1000 cafe_count_1000_price_1500 cafe_count_1000_price_2500 cafe_count_1000_price_4000 cafe_count_1000_price_high big_church_count_1000 church_count_1000 mosque_count_1000 leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_1000 cafe_count_3000_price_1500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_count_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_sum_5000_max_price_avg cafe_avg_price_5000 cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
0 1 2011-08-20 43 27.0 4.0 NaN NaN NaN NaN NaN NaN Investment Bibirevo 6.407578e+06 155572 0.189727 0.000070 9576 5001.0 5.0 10309 11065.0 5 0 240.0 1 0 7 3 no 0 16 1 no no no no no no no no 86206 40477 45729 21154 11007 10147 98207 52277 45930 36211 10580 25631 9576 4899 4677 10309 5463 4846 23603 12286 11317 17508 9425 8083 18654 9709 8945 211.0 25.0 0.0 0.0 0.0 2.0 184.0 0.0 0.0 0.0 211.0 0.0 0.0 0.0 206.0 5.0 1 2.590241 1.131260 13.575119 1.131260 0.145700 0.177975 2.158587 0.600973 1.080934 23.683460 1.804127 3.633334 5.419893 65.038716 1.0 5.419893 6.905893 1 0.274985 3.299822 0.992631 no 1.422391 10.918587 13.100618 13.675657 15.156211 1.422391 1 no 3.830951 5 1.305159 no 14.231961 101 24.292406 1 18.152338 5.718519 1.210027 1.062513 5.814135 4.308127 10.814172 1.676258 0.485841 3.065047 1.107594 8.148591 3.516513 2.392353 4.248036 0.974743 6.715026 0.884350 0.648488 0.637189 0.947962 0.177975 0.625783 0.628187 3.932040 14.053047 7.389498 7.023705 0.516838 good 0.00 0.00 0 0 0 0 0 NaN NaN NaN 0 0 0 0 0 0 0 0 0 0 0 1 0 7.36 0.00 1 30500 3 55600 19 527.78 888.89 708.33 1 10 4 3 1 0 0 1 2 0 0 6 1 14.27 6.92 3 39554 9 171420 34 566.67 969.70 768.18 1 14 11 6 2 0 0 1 2 0 0 7 1 11.77 15.97 9 188854 19 1244891 36 614.29 1042.86 828.57 1 15 11 6 2 1 0 1 2 0 0 10 1 11.98 13.55 12 251554 23 1419204 68 639.68 1079.37 859.52 5 21 22 16 3 1 0 2 4 0 0 21 1 13.09 13.31 29 807385 52 4036616 152 708.57 1185.71 947.14 12 39 48 40 9 4 0 13 22 1 0 52 4 5850000
1 2 2011-08-23 34 19.0 3.0 NaN NaN NaN NaN NaN NaN Investment Nagatinskij Zaton 9.589337e+06 115352 0.372602 0.049637 6880 3119.0 5.0 7759 6237.0 8 0 229.0 1 0 6 1 yes 1 3 0 no no no no no no no no 76284 34200 42084 15727 7925 7802 70194 35622 34572 29431 9266 20165 6880 3466 3414 7759 3909 3850 17700 8998 8702 15164 7571 7593 13729 6929 6800 245.0 83.0 1.0 0.0 67.0 4.0 90.0 0.0 0.0 0.0 244.0 1.0 1.0 143.0 84.0 15.0 2 0.936700 0.647337 7.620630 0.635053 0.147754 0.273345 0.550690 0.065321 0.966479 1.317476 4.655004 8.648587 3.411993 40.943917 2.0 3.641773 4.679745 2 0.065263 0.783160 0.698081 no 9.503405 3.103996 6.444333 8.132640 8.698054 2.887377 2 no 3.103996 4 0.694536 no 9.242586 32 5.706113 2 9.034642 3.489954 2.724295 1.246149 3.419574 0.725560 6.910568 3.424716 0.668364 2.000154 8.972823 6.127073 1.161579 2.543747 12.649879 1.477723 1.852560 0.686252 0.519311 0.688796 1.072315 0.273345 0.967821 0.471447 4.841544 6.829889 0.709260 2.358840 0.230287 excellent 25.14 0.00 0 0 0 0 5 860.00 1500.00 1180.00 0 1 3 0 0 1 0 0 1 0 0 0 0 26.66 0.07 2 86600 5 94065 13 615.38 1076.92 846.15 0 5 6 1 0 1 0 1 2 0 4 2 0 21.53 7.71 3 102910 7 127065 17 694.12 1205.88 950.00 0 6 7 1 2 1 0 1 5 0 4 9 0 22.37 19.25 4 165510 8 179065 21 695.24 1190.48 942.86 0 7 8 3 2 1 0 1 5 0 4 11 0 18.07 27.32 12 821986 14 491565 30 631.03 1086.21 858.62 1 11 11 4 2 1 0 1 7 0 6 19 1 10.26 27.47 66 2690465 40 2034942 177 673.81 1148.81 911.31 9 49 65 36 15 3 0 15 29 1 10 66 14 6000000
2 3 2011-08-27 43 29.0 2.0 NaN NaN NaN NaN NaN NaN Investment Tekstil'shhiki 4.808270e+06 101708 0.112560 0.118537 5879 1463.0 4.0 6207 5580.0 7 0 1183.0 1 0 5 1 no 0 0 1 no no no yes no no no no 101982 46076 55906 13028 6835 6193 63388 31813 31575 25292 7609 17683 5879 3095 2784 6207 3269 2938 14884 7821 7063 19401 9045 10356 11252 5916 5336 330.0 59.0 0.0 0.0 206.0 4.0 60.0 0.0 1.0 0.0 330.0 1.0 0.0 246.0 63.0 20.0 3 2.120999 1.637996 17.351515 1.445960 0.049102 0.158072 0.374848 0.453172 0.939275 4.912660 3.381083 11.996480 1.277658 15.331896 3.0 1.277658 1.701420 3 0.328756 3.945073 0.468265 no 5.604800 2.927487 6.963403 8.054252 9.067885 0.647250 3 no 2.927487 4 0.700691 no 9.540544 5 6.710302 3 5.777394 7.506612 0.772216 1.602183 3.682455 3.562188 5.752368 1.375443 0.733101 1.239304 1.978517 0.767569 1.952771 0.621357 7.682303 0.097144 0.841254 1.510089 1.486533 1.543049 0.391957 0.158072 3.178751 0.755946 7.922152 4.273200 3.156423 4.958214 0.190462 poor 1.67 0.00 0 0 0 0 3 666.67 1166.67 916.67 0 0 2 1 0 0 0 0 0 0 0 0 0 4.99 0.29 0 0 0 0 9 642.86 1142.86 892.86 2 0 5 2 0 0 0 0 1 0 0 5 3 9.92 6.73 0 0 1 2600 14 516.67 916.67 716.67 2 4 6 2 0 0 0 0 4 0 0 6 5 12.99 12.75 4 100200 7 52550 24 563.64 977.27 770.45 2 8 9 4 1 0 0 0 4 0 0 8 5 12.14 26.46 8 110856 7 52550 41 697.44 1192.31 944.87 2 9 17 9 3 1 0 0 11 0 0 20 6 13.69 21.58 43 1478160 35 1572990 122 702.68 1196.43 949.55 10 29 45 25 10 3 0 11 27 0 4 67 10 5700000
3 4 2011-09-01 89 50.0 9.0 NaN NaN NaN NaN NaN NaN Investment Mitino 1.258354e+07 178473 0.194703 0.069753 13087 6839.0 9.0 13670 17063.0 10 0 NaN 1 0 17 6 no 0 11 4 no no no no no no no no 21155 9828 11327 28563 14680 13883 120381 60040 60341 29529 9083 20446 13087 6645 6442 13670 7126 6544 32063 16513 15550 3292 1450 1842 24934 12782 12152 458.0 9.0 51.0 12.0 124.0 50.0 201.0 0.0 9.0 2.0 459.0 13.0 24.0 40.0 130.0 252.0 4 1.489049 0.984537 11.565624 0.963802 0.179441 0.236455 0.078090 0.106125 0.451173 15.623710 2.017080 14.317640 4.291432 51.497190 4.0 3.816045 5.271136 4 0.131597 1.579164 1.200336 no 2.677824 14.606501 17.457198 18.309433 19.487005 2.677824 1 no 2.780449 17 1.999265 no 17.478380 83 6.734618 1 27.667863 9.522538 6.348716 1.767612 11.178333 0.583025 27.892717 0.811275 0.623484 1.950317 6.483172 7.385521 4.923843 3.549558 8.789894 2.163735 10.903161 0.622272 0.599914 0.934273 0.892674 0.236455 1.031777 1.561505 15.300449 16.990677 16.041521 5.029696 0.465820 good 17.36 0.57 0 0 0 0 2 1000.00 1500.00 1250.00 0 0 0 2 0 0 0 0 0 0 0 0 0 19.25 10.35 1 11000 6 80780 12 658.33 1083.33 870.83 0 3 4 5 0 0 0 0 0 0 0 3 1 28.38 6.57 2 11000 7 89492 23 673.91 1130.43 902.17 0 5 9 8 1 0 0 1 0 0 0 9 2 32.29 5.73 2 11000 7 89492 25 660.00 1120.00 890.00 0 5 11 8 1 0 0 1 1 0 0 13 2 20.79 3.57 4 167000 12 205756 32 718.75 1218.75 968.75 0 5 14 10 3 0 0 1 2 0 0 18 3 14.18 3.89 8 244166 22 942180 61 931.58 1552.63 1242.11 4 7 21 15 11 2 1 4 4 0 0 26 3 13100000
4 5 2011-09-05 77 77.0 4.0 NaN NaN NaN NaN NaN NaN Investment Basmannoe 8.398461e+06 108171 0.015234 0.037316 5706 3240.0 7.0 6748 7770.0 9 0 562.0 4 2 25 2 no 0 10 93 no no no yes yes no no no 28179 13522 14657 13368 7159 6209 68043 34236 33807 26760 8563 18197 5706 2982 2724 6748 3664 3084 15237 8113 7124 5164 2583 2581 11631 6223 5408 746.0 48.0 0.0 0.0 643.0 16.0 35.0 0.0 3.0 1.0 746.0 371.0 114.0 146.0 62.0 53.0 5 1.257186 0.876620 8.266305 0.688859 0.247901 0.376838 0.258289 0.236214 0.392871 10.683540 2.936581 11.903910 0.853960 10.247521 5.0 1.595898 2.156284 113 0.071480 0.857764 0.820294 no 11.616653 1.721834 0.046810 0.787593 2.578671 1.721834 4 no 3.133531 10 0.084113 yes 1.595898 113 1.423428 4 6.515857 8.671016 1.638318 3.632640 4.587917 2.609420 9.155057 1.969738 0.220288 2.544696 3.975401 3.610754 0.307915 1.864637 3.779781 1.121703 0.991683 0.892668 0.429052 0.077901 0.810801 0.376838 0.378756 0.121681 2.584370 1.112486 1.800125 1.339652 0.026102 excellent 3.56 4.44 15 293699 1 45000 48 702.22 1166.67 934.44 3 17 10 11 7 0 0 1 4 0 2 3 0 3.34 8.29 46 420952 3 158200 153 763.45 1272.41 1017.93 8 39 45 39 19 2 1 7 12 0 6 7 0 4.12 4.83 93 1195735 9 445900 272 766.80 1272.73 1019.76 19 70 74 72 30 6 1 18 30 0 10 14 2 4.53 5.02 149 1625130 17 564843 483 765.93 1269.23 1017.58 28 130 129 131 50 14 1 35 61 0 17 21 3 5.06 8.62 305 3420907 60 2296870 1068 853.03 1410.45 1131.74 63 266 267 262 149 57 4 70 121 1 40 77 5 8.38 10.92 689 8404624 114 3503058 2283 853.88 1411.45 1132.66 143 566 578 552 319 108 17 135 236 2 91 195 14 16331452

In [268]:
train_pr = preprocess(train_raw)
train_pr = preprocess_categorial(train_pr)
train = feature_exclude(train_pr)

In [ ]:
# Tell auto_ml which column is 'output'
# Also note columns that aren't purely numerical
# Examples include ['nlp', 'date', 'categorical', 'ignore']
column_descriptions = {
  'price_doc': 'output'
}

ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

ml_predictor.train(train)

file_name = ml_predictor.save()
print file_name

# Score the model on test data
test_score = ml_predictor.score(df_test, df_test.MEDV)

Смотрим на данные


In [173]:
#Checking for missing data
NAs = pd.concat([
    train.isnull().sum(), 
    test_pr.isnull().sum()
], axis=1, keys=['Train', 'Test'])
NAs[NAs.sum(axis=1) > 0]


Out[173]:
Train Test
ID_big_road1_sll 3.0 NaN
ID_big_road2_sll 1.0 NaN
ID_metro_sll 15.0 NaN
ID_railroad_station_avto_sll 13.0 NaN
ID_railroad_station_walk_sll 11.0 NaN
age_of_building 3142.0 1049.0
build_count_1921-1945 2611.0 1218.0
build_count_1971-1995 2611.0 1218.0
build_count_after_1995 2611.0 1218.0
build_count_before_1920 2611.0 1218.0
build_count_block 2611.0 1218.0
build_count_brick 2611.0 1218.0
build_count_frame 2611.0 1218.0
build_count_mix 2611.0 1218.0
build_count_monolith 2611.0 1218.0
build_count_panel 2611.0 1218.0
build_count_slag 2611.0 1218.0
build_count_wood 2611.0 1218.0
build_year NaN 1606.0
build_year_ten 3807.0 NaN
cafe_avg_price_1000 3337.0 1222.0
cafe_avg_price_1500 2165.0 821.0
cafe_avg_price_2000 935.0 424.0
cafe_avg_price_3000 692.0 182.0
cafe_avg_price_500 7599.0 3159.0
cafe_sum_1000_max_price_avg 3337.0 1222.0
cafe_sum_1000_min_price_avg 3337.0 1222.0
cafe_sum_1500_max_price_avg 2165.0 821.0
cafe_sum_1500_min_price_avg 2165.0 821.0
cafe_sum_2000_max_price_avg 935.0 424.0
cafe_sum_2000_min_price_avg 935.0 424.0
cafe_sum_3000_max_price_avg 692.0 182.0
cafe_sum_3000_min_price_avg 692.0 182.0
cafe_sum_5000_min_price_avg 227.0 128.0
cafe_sum_500_max_price_avg 7599.0 3159.0
cafe_sum_500_min_price_avg 7599.0 3159.0
floor 7.0 0.0
full_sq 18.0 4.0
green_part_2000 0.0 19.0
hospital_beds_raion 8470.0 3418.0
kitch_sq 4891.0 2163.0
life_sq 3484.0 1516.0
max_floor 4.0 643.0
metro_km_walk 24.0 34.0
num_room 6.0 0.0
preschool_quota 3690.0 1596.0
prom_part_5000 147.0 92.0
railroad_station_walk_min 24.0 34.0
raion_build_count_with_material_info 2611.0 1218.0
ratio_kitch_sq_full_sq 4891.0 2163.0
ratio_kitch_sq_life_sq 5146.0 2214.0
ratio_life_sq_full_sq 3484.0 1519.0
rel_floor 7.0 643.0
rel_kitch_sq 4891.0 2163.0
rel_life_sq 3484.0 1519.0
school_quota 3687.0 1595.0
state 3148.0 694.0
sub_area_sll 2.0 NaN

In [ ]: