In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline
%load_ext ipycache
import pandas as pd
pd.options.display.max_rows = 300
pd.options.display.max_columns = 300
import lightgbm as lgb
import numpy as np
import scipy
import sklearn as sk
import xgboost as xgb
from eli5 import show_weights
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
In [2]:
# Helper functions
import math
from sklearn.metrics import make_scorer
# A function to compute Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [
        (math.log(p + 1) - math.log(t + 1)) ** 2.0
        for t, p in zip(y, y_pred)
    ]
    return (sum(terms_to_sum) / float(len(y))) ** 0.5
def rmse(y, y_pred):
return np.sqrt(((y_pred - y) ** 2).mean())
def feat_imp(model):
return pd.DataFrame(
model.get_fscore().items(),
columns=['feature','importance']
).sort_values('importance', ascending=False)
def unlog(y):
return np.expm1(y)
rmse_scoring = make_scorer(rmse, greater_is_better=False)
rmsle_scoring = make_scorer(rmsle, greater_is_better=False)
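Since align_to_lb_score below log1p-transforms the target, RMSE on the transformed target equals RMSLE on the raw prices. A quick sanity check of that equivalence (illustrative, made-up prices):
In [ ]:
# illustrative: RMSE on log1p prices coincides with RMSLE on raw prices
y_true = np.array([1e6, 2e6, 3e6])
y_hat = np.array([1.1e6, 1.8e6, 3.3e6])
print rmsle(y_true, y_hat)                     # RMSLE on raw prices
print rmse(np.log1p(y_true), np.log1p(y_hat))  # same value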
In [3]:
def align_to_lb_score(df):
# https://www.kaggle.com/c/sberbank-russian-housing-market/discussion/32717
df = df.copy()
trainsub = df[df.timestamp < '2015-01-01']
trainsub = trainsub[trainsub.product_type=="Investment"]
ind_1m = trainsub[trainsub.price_doc <= 1000000].index
ind_2m = trainsub[trainsub.price_doc == 2000000].index
ind_3m = trainsub[trainsub.price_doc == 3000000].index
train_index = set(df.index.copy())
    for ind, gap in zip([ind_1m, ind_2m, ind_3m], [10, 3, 2]):
        # keep every gap-th of the suspicious rows, drop the rest
        ind_set_cut = set(ind).difference(set(ind[::gap]))
        train_index = train_index.difference(ind_set_cut)
df = df.loc[train_index]
df["price_doc"] = np.log1p(df["price_doc"].values)
return df
def preprocess_anomaly(df):
    # drop rows with missing values in key fields from the training set; in test these fields are always filled
df = df.dropna(subset=["preschool_education_centers_raion", "num_room",
"max_floor", "material", "kitch_sq", "floor"])
df["product_type"].fillna("Investment", inplace=True)
df["full_sq"] = map(lambda x: x if x > 10 else float("NaN"), df["full_sq"])
df["life_sq"] = map(lambda x: x if x > 5 else float("NaN"), df["life_sq"])
df["kitch_sq"] = map(lambda x: x if x > 2 else float("NaN"), df["kitch_sq"])
# superclean
# https://www.kaggle.com/keremt/very-extensive-cleaning-by-sberbank-discussions
    df.loc[df.life_sq > df.full_sq, "life_sq"] = np.NaN
    df.loc[df.kitch_sq >= df.life_sq, "kitch_sq"] = np.NaN
    df.loc[df.kitch_sq == 0, "kitch_sq"] = np.NaN
    df.loc[df.kitch_sq == 1, "kitch_sq"] = np.NaN
    df.loc[df.num_room == 0, "num_room"] = np.NaN
    df.loc[df.floor == 0, "floor"] = np.NaN
    df.loc[df.max_floor == 0, "max_floor"] = np.NaN
    df.loc[df.floor > df.max_floor, "max_floor"] = np.NaN
    df.loc[df.state == 33, "state"] = np.NaN
    df.loc[df.build_year == 20052009, "build_year"] = 2005
    df.loc[df.build_year == 20, "build_year"] = 2000
    df.loc[df.build_year == 215, "build_year"] = 2015
    df.loc[df.build_year < 1500, "build_year"] = np.NaN
    df.loc[df.build_year > 2022, "build_year"] = np.NaN
return df
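A toy check of the thinning in align_to_lb_score (illustrative data): with gap=10, only every 10th of the pre-2015 Investment rows priced at or below 1M survives.
In [ ]:
# illustrative: 20 suspicious rows -> 2 survivors after gap=10 thinning
toy = pd.DataFrame({
    "timestamp": ["2014-01-01"] * 20,
    "product_type": ["Investment"] * 20,
    "price_doc": [900000] * 20,
})
print len(align_to_lb_score(toy))  # 2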
In [4]:
def smoothed_likelihood(targ_mean, nrows, globalmean, alpha=10):
    # shrink the per-category target mean towards the global mean;
    # the fewer rows a category has, the stronger the pull (controlled by alpha)
    try:
        return (targ_mean * nrows + globalmean * alpha) / (nrows + alpha)
    except TypeError:  # category unseen in the other folds: targ_mean/nrows is None
        return float("NaN")
def mess_y_categorial(df, nfolds=3, alpha=10):
from copy import copy
folds = np.array_split(df, nfolds)
newfolds = []
for i in range(nfolds):
fold = folds[i]
other_folds = copy(folds)
other_folds.pop(i)
other_fold = pd.concat(other_folds)
        newfolds.append(mess_y_categorial_fold(fold, other_fold, alpha=alpha))
return pd.concat(newfolds)
def mess_y_categorial_fold(fold_raw, other_fold, cols=None, y_col="price_doc", alpha=10):
fold = fold_raw.copy()
if not cols:
cols = list(fold.select_dtypes(include=["object"]).columns)
globalmean = other_fold[y_col].mean()
for c in cols:
target_mean = other_fold[[c, y_col]].fillna("").groupby(c).mean().to_dict()[y_col]
nrows = other_fold[c].fillna("").value_counts().to_dict()
fold[c + "_sll"] = fold[c].fillna("").apply(
lambda x: smoothed_likelihood(target_mean.get(x), nrows.get(x), globalmean, alpha)
)
return fold
def feature_exclude(df):
    # Drop build_year; age_of_building stays instead.
    # build_year probably causes overfitting.
    feats = ["build_year", "build_year_cat_le"]
    # greedy_search.tsv is written by greedy_remove_features below; columns are
    # feature, candidate score, previous best score, importance, best iteration, verdict
    with open("greedy_search.tsv") as gs:
        for line in gs:
            row = line.strip().split("\t")
            if len(row) < 6:
                continue
            if row[5] == "remove":
                feats.append(row[0])
    df = df.drop(feats, axis=1, errors="ignore")
    return df
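A toy run of the out-of-fold smoothed target encoding (illustrative data): frequent categories stay close to their own mean, rare ones are pulled towards the global mean, unseen ones get NaN.
In [ ]:
# illustrative: smoothed likelihood encoding on a tiny frame
toy_other = pd.DataFrame({
    "sub_area": ["A"] * 8 + ["B"] * 2,
    "price_doc": [10.0] * 8 + [20.0] * 2,
})
toy_fold = pd.DataFrame({"sub_area": ["A", "B", "C"]})
encoded = mess_y_categorial_fold(toy_fold, toy_other, cols=["sub_area"], alpha=10)
print encoded["sub_area_sll"]
# global mean = 12, so:
# "A": (10*8 + 12*10)/18 ~ 11.1, "B": (20*2 + 12*10)/12 ~ 13.3, "C": NaN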
In [5]:
ALPHA = 50
lbl = sk.preprocessing.LabelEncoder()
def preprocess_categorial(df):
for c in list(df.columns):
if df[c].dtype == 'object':
try:
try:
lbl.fit(list(train_raw[c].values) + list(test[c].values) + list(df[c].values))
except KeyError as e:
lbl.fit(df[c].values)
df[c + "_le"] = lbl.transform(list(df[c].values))
except ValueError as e:
print c, e
raise
df = mess_y_categorial(df, 5, alpha=ALPHA)
df = df.select_dtypes(exclude=['object'])
return df
def apply_categorial(test, train):
for c in list(test.columns):
if test[c].dtype == 'object':
try:
lbl.fit(list(train_raw[c].values) + list(test[c].values) + list(train[c].values))
except KeyError:
lbl.fit(test[c].values)
test[c + "_le"] = lbl.transform(list(test[c].values))
test = mess_y_categorial_fold(test, train, alpha=ALPHA)
test = test.select_dtypes(exclude=['object'])
return test
def apply_macro(df):
macro_cols = [
'timestamp', "balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
"micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
"income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"
]
return df.merge(macro[macro_cols], on='timestamp', how='left')
In [6]:
def preprocess(df):
df = df.copy()
ecology = ["no data", "poor", "satisfactory", "good", "excellent"]
df["ecology_index"] = map(ecology.index, df["ecology"].values)
df["age_of_building"] = df["timestamp"].apply(lambda x: x.split("-")[0]).astype(int) - df["build_year"]
df["is_build_in_progress"] = df["age_of_building"].apply(lambda x: "yes" if x < 0 else "no")
bool_feats = [
"thermal_power_plant_raion",
"incineration_raion",
"oil_chemistry_raion",
"radiation_raion",
"railroad_terminal_raion",
"big_market_raion",
"nuclear_reactor_raion",
"detention_facility_raion",
"water_1line",
"big_road1_1line",
"railroad_1line",
"culture_objects_top_25"
]
    for bf in bool_feats:
        try:
            df[bf + "_bool"] = map(lambda x: x == "yes", df[bf].values)
        except KeyError:
            # the column is absent in this dataframe
            pass
df = preprocess_anomaly(df)
df['rel_floor'] = df['floor'] / df['max_floor'].astype(float)
df['rel_kitch_sq'] = df['kitch_sq'] / df['full_sq'].astype(float)
df['rel_life_sq'] = df['life_sq'] / df['full_sq'].astype(float)
df["material_cat"] = df.material.fillna(0).astype(int).astype(str).replace("0", "")
df["state_cat"] = df.state.fillna(0).astype(int).astype(str).replace("0", "")
# df["num_room_cat"] = df.num_room.fillna(0).astype(int).astype(str).replace("0", "")
# df["build_year_cat"] = df.build_year.fillna(0).astype(int).astype(str).replace("0", "")
df["build_year_ten"] = (df.build_year / 10).round()
df["ID_metro"] = df.ID_metro.fillna(-10).astype(int).astype(str).replace("-10", "")
df["ID_railroad_station_walk"] = df.ID_railroad_station_walk.replace("", "-10").fillna(-10).astype(int).astype(str).replace("-10", "")
df["ID_railroad_station_avto"] = df.ID_railroad_station_avto.fillna(-10).astype(int).astype(str).replace("-10", "")
df["ID_big_road1"] = df.ID_big_road1.fillna(-10).astype(int).astype(str).replace("-10", "")
df["ID_big_road2"] = df.ID_big_road2.fillna(-10).astype(int).astype(str).replace("-10", "")
df["ID_bus_terminal"] = df.ID_bus_terminal.fillna(-10).astype(int).astype(str).replace("-10", "")
    # ratios of living area to full area, kitchen to living, kitchen to full, clipped to [0, 1]
    df["ratio_life_sq_full_sq"] = (df["life_sq"] / np.maximum(df["full_sq"].astype("float"), 1)).clip(0, 1)
    df["ratio_kitch_sq_life_sq"] = (df["kitch_sq"] / np.maximum(df["life_sq"].astype("float"), 1)).clip(0, 1)
    df["ratio_kitch_sq_full_sq"] = (df["kitch_sq"] / np.maximum(df["full_sq"].astype("float"), 1)).clip(0, 1)
df = df.drop(["timestamp"], axis=1, errors="ignore")
return df
In [7]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv", index_col="id")
test = pd.read_csv("data/test.csv", index_col="id")
macro = pd.read_csv("data/macro.csv")
In [8]:
train_pr = align_to_lb_score(train_raw)
train_pr = preprocess(train_pr)
train_pr = preprocess_categorial(train_pr)
train = feature_exclude(train_pr)
train.head()
Out[8]:
In [9]:
important_feats = ["full_sq", "life_sq", "kitch_sq", "max_floor"]
# important_feats = ["full_sq", "life_sq"]
# Train models to fill NAs in the important fields; the order matters
feats_to_remove = ["price_doc", "rel_kitch_sq", "rel_life_sq", "id", "build_year_cat_le",
"age_of_building", "rel_floor", "num_room_cat_le", "build_year_ten",
"ratio_life_sq_full_sq", "ratio_kitch_sq_full_sq", "ratio_kitch_sq_life_sq"]
In [10]:
%%cache na_models.pkl na_models
na_models = {}
xgb_params = {
'max_depth': 5,
'n_estimators': 200,
'learning_rate': 0.05,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1
}
for f in important_feats:
t = train[train[f].notnull()]
fX = t.drop([f] + feats_to_remove, axis=1, errors="ignore")
fy = t[f].values
dtrain_all = xgb.DMatrix(fX.values, fy, feature_names=fX.columns)
model = xgb.train(xgb_params, dtrain_all, num_boost_round=400, verbose_eval=40)
na_models[f] = model
print f
print feat_imp(model).head(10)
In [11]:
def fill_na_xgb(df_orig):
df = df_orig.copy()
for f in important_feats:
X_pr = df[df[f].isnull()].drop([f] + feats_to_remove, axis=1, errors="ignore")
if not len(X_pr):
continue
X_pr = xgb.DMatrix(X_pr.values, feature_names=X_pr.columns)
df.loc[df[f].isnull(), f] = na_models[f].predict(X_pr).round()
df[f] = df[f].astype(int)
return df
In [12]:
train = fill_na_xgb(train)
In [33]:
from sklearn.model_selection import train_test_split
X = train.drop(["price_doc"], axis=1)
y = train["price_doc"].values
bound = int(len(X) * 0.7)
X_train, X_val, y_train, y_val = X[:bound].copy(), X[bound:].copy(), y[:bound].copy(), y[bound:].copy()
In [34]:
dtrain_all = xgb.DMatrix(X.values, y, feature_names=X.columns)
dtrain = xgb.DMatrix(X_train.values, y_train, feature_names=X.columns)
dval = xgb.DMatrix(X_val.values, y_val, feature_names=X.columns)
In [35]:
xgb_params = {
'eta': 0.01,
'max_depth': 5,
'subsample': 0.7,
'colsample_bytree': 0.7,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1
}
# Tune num_boost_round via early stopping on the validation fold
model = xgb.train(xgb_params, dtrain, num_boost_round=4000, evals=[(dval, 'val')],
early_stopping_rounds=20, verbose_eval=40)
num_boost_round = model.best_iteration
In [16]:
cv_output = xgb.cv(xgb_params, dtrain_all, num_boost_round=4000,
verbose_eval=100, early_stopping_rounds=100, nfold=5)
In [36]:
xgbmodel = xgb.train(xgb_params, dtrain, num_boost_round=num_boost_round, verbose_eval=40)
y_pred = xgbmodel.predict(dtrain)
print "predict-train:", rmse(y_pred, y_train)
submdf = pd.DataFrame({"id": X_train.index, "price_doc": unlog(y_pred)})
submdf.to_csv("xgb_train_preds.csv", index=False)
y_pred = xgbmodel.predict(dval)
print "predict-val:", rmse(y_pred, y_val)
submdf = pd.DataFrame({"id": X_val.index, "price_doc": unlog(y_pred)})
submdf.to_csv("xgb_val_preds.csv", index=False)
feat_imp(xgbmodel).head(10)
Out[36]:
In [37]:
RS = 20170501
np.random.seed(RS)
FACT_ROUNDS = 0
ROUNDS = 2000
lgb_params = {
'objective': 'regression',
'metric': 'rmse',
'boosting': 'gbdt',
'learning_rate': 0.01,
# 'verbose': 1,
# 'num_leaves': 2 ** 5,
'bagging_fraction': 0.95,
'bagging_freq': 1,
'bagging_seed': RS,
# 'feature_fraction': 0.7,
# 'feature_fraction_seed': RS,
    # subsample/colsample_bytree are LightGBM aliases of bagging_fraction/feature_fraction
    'subsample': 0.7,
    'colsample_bytree': 0.7,
# 'max_bin': 100,
'max_depth': 10,
'num_rounds': ROUNDS
}
lgb_train_all = lgb.Dataset(X, y)
lgb_train = lgb.Dataset(X_train, y_train)
In [38]:
cvres = pd.DataFrame(lgb.cv(params=lgb_params, train_set=lgb_train, nfold=5, shuffle=False,
early_stopping_rounds=100, verbose_eval=100, num_boost_round=ROUNDS))
FACT_ROUNDS = len(cvres)
In [39]:
lgbmodel = lgb.train(lgb_params, lgb_train, num_boost_round=FACT_ROUNDS or ROUNDS)
pd.DataFrame({
"name": lgbmodel.feature_name(),
"imp": lgbmodel.feature_importance()}
).sort_values("imp", ascending=False).head(20)
Out[39]:
In [40]:
y_pred = lgbmodel.predict(X_train)
print "predict-train:", rmse(y_pred, y_train)
submdf = pd.DataFrame({"id": X_train.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lgb_train_preds.csv", index=False)
y_pred = lgbmodel.predict(X_val)
print "predict-val:", rmse(y_pred, y_val)
submdf = pd.DataFrame({"id": X_val.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lgb_val_preds.csv", index=False)
In [123]:
from vowpalwabbit.sklearn_vw import VWRegressor
In [29]:
from sklearn.base import TransformerMixin
from scipy.stats import skew
class SkewLogAlign(TransformerMixin):
    """log1p-transform numeric features whose skewness exceeds a threshold"""
    skewed_feats = None
    def __init__(self, skew_threshold=0.75):
        self.skew_threshold = skew_threshold
    def fit(self, X, y=None):
        # log transform skewed numeric features
        df = pd.DataFrame(X, dtype=np.float64)
        skewed_feats = df.apply(lambda x: skew(x.dropna()))  # compute skewness
        skewed_feats = skewed_feats[skewed_feats > self.skew_threshold]
        self.skewed_feats = skewed_feats.index
        return self
    def transform(self, X):
        df = pd.DataFrame(X, dtype=np.float64)
        df[self.skewed_feats] = np.log1p(df[self.skewed_feats].values)
        return df.values
import sys
class FillNaWithConstant(TransformerMixin):
nan_value = 0
inf_value = None
minf_value = None
def __init__(self, nan_value=0, inf_value=sys.maxint - 1, minf_value=-sys.maxint - 1):
self.nan_value = nan_value
self.inf_value = inf_value
self.minf_value = minf_value
def fit(self, X, y=None):
return self
def transform(self, X):
df = pd.DataFrame(X).fillna(self.nan_value)
df = df.replace(np.inf, self.inf_value)
df = df.replace(-np.inf, self.minf_value)
return df.values
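A quick check of the two transformers on a tiny matrix (illustrative): the heavily skewed first column gets log1p-ed, then NaN values are replaced by constants.
In [ ]:
# illustrative: skew alignment followed by NA/inf filling
toy = np.array([[1.0, np.nan], [2.0, 1.0], [300.0, 2.0]])
aligned = SkewLogAlign(skew_threshold=0.75).fit_transform(toy)
filled = FillNaWithConstant(nan_value=0).fit_transform(aligned)
print filled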
In [45]:
from sklearn.pipeline import Pipeline
lasso_feat_pipeline = Pipeline([
("skew", SkewLogAlign()),
("fillna", FillNaWithConstant()),
])
In [143]:
from sklearn.linear_model import LassoCV
LASSO_alphas = [1, 0.1, 0.001, 0.0005]
lasso_cv_model = LassoCV(alphas=LASSO_alphas, cv=5, max_iter=50000, verbose=True, n_jobs=-1)
lasso_cv_model.fit(lasso_feat_pipeline.fit_transform(X.values), y)
print "alpha:", lasso_cv_model.alpha_
print "MSE:"
print zip(LASSO_alphas, np.sqrt(lasso_cv_model.mse_path_))
print pd.Series(lasso_cv_model.coef_, index=X.columns).sort_values(ascending=False)[:20]
In [46]:
from sklearn.linear_model import Lasso
best_alpha = 0.001
lasso_model = Pipeline([
("feat", lasso_feat_pipeline),
("clf", Lasso(alpha=best_alpha, max_iter=50000))
])
lasso_model.fit(X_train.values, y_train)
Out[46]:
In [47]:
y_pred = lasso_model.predict(X_train.values)
print "predict-train:", rmse(y_pred, y_train)
submdf = pd.DataFrame({"id": X_train.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lasso_train_preds.csv", index=False)
y_pred = lasso_model.predict(X_val.values)
print "predict-validation:", rmse(y_pred, y_val)
submdf = pd.DataFrame({"id": X_val.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lasso_val_preds.csv", index=False)
In [41]:
test_pr = preprocess(test)
train_pr = preprocess(train_raw)
test_pr = apply_categorial(test_pr, train_pr)
test_pr = feature_exclude(test_pr)
test_pr = fill_na_xgb(test_pr)
In [42]:
# XGB
dtest = xgb.DMatrix(test_pr.values, feature_names=test_pr.columns)
y_pred = xgbmodel.predict(dtest)
submdf = pd.DataFrame({"id": test_pr.index, "price_doc": unlog(y_pred)})
submdf.to_csv("xgb_sub.csv", index=False)
!head xgb_sub.csv
In [43]:
y_pred = lgbmodel.predict(test_pr)
submdf = pd.DataFrame({"id": test_pr.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lgb_sub.csv", index=False)
!head lgb_sub.csv
In [51]:
y_pred = lasso_model.predict(test_pr.values)
submdf = pd.DataFrame({"id": test_pr.index, "price_doc": unlog(y_pred)})
submdf.to_csv("lasso_sub.csv", index=False)
!head lasso_sub.csv
In [46]:
models = ["lgb", "xgb"]
In [47]:
etrain = pd.DataFrame(index=X_val.index)
etrain = etrain.join(train[["price_doc"]])
for i, p in enumerate(models):
pred = pd.read_csv("%s_val_preds.csv" % p, index_col="id", names=["id", "p_%s" % i], header=0)
etrain = etrain.join(pred)
eX = etrain.drop("price_doc", axis=1)
ey = etrain["price_doc"].values
etrain.head()
Out[47]:
In [48]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LassoCV
emodel = Pipeline([
("skew", SkewLogAlign()),
("fillna", FillNaWithConstant()),
("clf", LassoCV(alphas=None, cv=5, max_iter=50000, verbose=True, n_jobs=-1))
])
emodel.fit(eX.values, ey)
lmodel = emodel.named_steps["clf"]
print "alpha:", lmodel.alpha_
print "MSE:"
print np.sqrt(lmodel.mse_path_)
print pd.Series(lmodel.coef_, index=eX.columns).sort_values(ascending=False)[:20]
In [31]:
eFACT_ROUNDS = 0
In [ ]:
elgb_train = lgb.Dataset(eX, ey)
cvres = pd.DataFrame(lgb.cv(params=lgb_params, train_set=elgb_train, nfold=7, shuffle=False,
early_stopping_rounds=100, verbose_eval=100, num_boost_round=ROUNDS))
eFACT_ROUNDS = len(cvres)
In [ ]:
emodel = lgb.train(lgb_params, elgb_train, num_boost_round=eFACT_ROUNDS or ROUNDS)
In [49]:
etest = test_pr[[]].copy()
for i, p in enumerate(models):
pred = pd.read_csv("%s_sub.csv" % p, index_col="id", names=["id", "p_%s" % i], header=0)
etest = etest.join(pred)
y_pred = emodel.predict(etest.values)
df = pd.DataFrame({"id": etest.index, "price_doc": unlog(y_pred)})
df.to_csv("ensemble_sub.csv", index=False)
!head ensemble_sub.csv
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude, xgb fillna, predict price meter:
val-rmse:42206.6
predict-train: 36746.0165399
kaggle: 0.31331
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude, xgb fillna, predict price doc:
val-rmse:2.57852e+06
train-rmse:1.90168e+06+26844.3 test-rmse:2.66642e+06+56338.9
predict-train: 2021259.19865
kaggle: 0.31386
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude, xgb fillna, predict price meter:
val-rmse:42206.6
predict-train: 36746.0165399
kaggle: 0.31331
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude:
val-rmse:2.55793e+06
train-rmse:1.74066e+06+28727.3 test-rmse:2.65025e+06+64969.5
predict-train: 1881896.66663
kaggle: 0.31344
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:
val-rmse:2.54654e+06
train-rmse:1.74594e+06+24020 test-rmse:2.66053e+06+67300.3
predict-train: 1883352.60935
kaggle: 0.31364
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:
val-rmse:2.55613e+06
train-rmse:1.74466e+06+27385.6 test-rmse:2.66422e+06+69734.1
predict-train: 1888051.35357
kaggle: 0.31366
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro with other ID, ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:
val-rmse:2.58557e+06
train-rmse:1.98509e+06+26803.7 test-rmse:2.68755e+06+59691.1
predict-train: 2092731.29028
kaggle: 0.31731
5*200, no macro, add rel features, no log price, train_without_noise:
val-rmse:2.63772e+06
train-rmse:1.9989e+06+10986.4 test-rmse:2.69158e+06+53020
predict-train: 2076010.27131
kaggle: 0.31720
5*200, no macro, add rel features, no log price, train_with_noise:
val-rmse:2.53378e+06
train-rmse:1.95069e+06+16166.4 test-rmse:2.69703e+06+61455.1
predict-train: 2054421.59869
kaggle: 0.32056
5*200, macro, add rel features, no log price, train_without_noise:
val-rmse:2.79632e+06
train-rmse:1.81015e+06+19781.2 test-rmse:2.6641e+06+123875
predict-train: 1904063.27368
kaggle: 0.32976
5*200, no macro, add rel features, no log price, train_without_noise:
val-rmse:2.61682e+06
train-rmse:1.81123e+06+27681.2 test-rmse:2.66923e+06+53925.7
predict-train: 1899129.43771
kaggle: 0.31592
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.61055e+06
train-rmse:1.71826e+06+30076.1 test-rmse:2.66515e+06+54583.5
predict-train: 1814572.97424
kaggle: 0.31602
7*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.59955e+06
train-rmse:1.41393e+06+21208.1 test-rmse:2.6763e+06+35553.3
predict-train: 1548257.49121
kaggle: 0.31768
4*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.63407e+06
train-rmse:1.96513e+06+21470.8 test-rmse:2.69417e+06+74288.3
predict-train: 2062299.41091
kaggle: 0.31952
7*200, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.59955e+06
train-rmse:1.41393e+06+21208.1 test-rmse:2.6763e+06+35553.3
predict-train: 1548257.49121
5*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.61055e+06
train-rmse:1.71826e+06+30076.1 test-rmse:2.66515e+06+54583.5
predict-train: 1814572.97424
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna:
val-rmse:2.61664e+06
train-rmse:1.77892e+06+23111 test-rmse:2.65829e+06+56398.6
predict-train: 1875799.54634
kaggle: 0.31521
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean:
val-rmse:2.6265e+06
train-rmse:1.78478e+06+22545.4 test-rmse:2.66179e+06+60626.3
predict-train: 1881672.27588
kaggle: 0.31476
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, no super features + Label Encoding:
val-rmse:2.56494e+06
train-rmse:1.78862e+06+18589.1 test-rmse:2.69283e+06+79861.4
predict-train: 1923466.41923
kaggle: 0.31434
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, remove material state num_room:
val-rmse:2.56932e+06
train-rmse:1.88495e+06+20133.7 test-rmse:2.69624e+06+70491.2
predict-train: 1979198.19201
kaggle: 0.31513
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro/bus...:
val-rmse:2.60017e+06
train-rmse:1.80654e+06+19453.5 test-rmse:2.68203e+06+68169.5
predict-train: 1906439.98603
kaggle: 0.31927
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, remove 50 features:
val-rmse:2.93665e+06
train-rmse:1.73425e+06+19462.4 test-rmse:2.68682e+06+140661
predict-train: 1861268.6455
kaggle: 0.31555
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, remove 50 features, add ratio feats:
val-rmse:2.59747e+06
train-rmse:1.75828e+06+26639.4 test-rmse:2.68491e+06+67201.8
predict-train: 1875707.6581
kaggle: 0.31760
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, superfeatures + Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle:
val-rmse:2.5419e+06
train-rmse:1.74381e+06+22710.7 test-rmse:2.65787e+06+66889.9
predict-train: 1862467.67153
kaggle: 0.31716
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle:
val-rmse:2.5676e+06
train-rmse:1.81485e+06+24274 test-rmse:2.67324e+06+60153.1
predict-train: 1947645.83102
kaggle: 0.31376
In [45]:
from tqdm import tqdm
def get_best_score(train):
xgb_params = {
'max_depth': 5,
'n_estimators': 200,
'learning_rate': 0.01,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': 1
}
cvres = xgb.cv(xgb_params, train, num_boost_round=4000, early_stopping_rounds=40)
return cvres["test-rmse-mean"].min(), cvres["test-rmse-mean"].argmin()
def df2DMatrix(df):
return xgb.DMatrix(data=df.drop("price_doc", axis=1).values, label=df["price_doc"].values)
def greedy_remove_features(df, feature_importances):
train = df
with open("greedy_search.tsv", "a") as f:
best_score, iterno = get_best_score(df2DMatrix(df))
f.write("\t".join(["INITIAL", str(best_score), str(iterno)]) + "\n")
to_analyze = sorted(feature_importances.items(), key=lambda x: x[1])
for feat, feat_importance in tqdm(to_analyze):
f.flush()
candidate_train = train.drop(feat, axis=1)
cand_best_score, iterno = get_best_score(df2DMatrix(candidate_train))
if cand_best_score > best_score:
            # score got worse, keep the feature
f.write("\t".join([feat, str(cand_best_score), str(best_score), str(feat_importance), str(iterno), "skip"]) + "\n")
f.flush()
continue
f.write("\t".join([feat, str(cand_best_score), str(best_score), str(feat_importance), str(iterno), "remove"]) + "\n")
best_score = cand_best_score
train = candidate_train
In [47]:
# imp_features: the feature-importance table produced by feat_imp(...) above
feature_importances = imp_features.set_index("feature").to_dict()["importance"]
train_gs = train
with open("greedy_search.tsv") as gs:
for line in gs:
row = line.strip().split("\t")
if len(row) < 6:
continue
if row[5] == "remove":
try:
train_gs = train_gs.drop(row[0], axis=1)
except ValueError:
pass
print "drop", row[0]
feature_importances.pop(row[0], None)
greedy_remove_features(train_gs, feature_importances)
In [168]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()
Out[168]:
In [169]:
# NOTE: assumes a variant of preprocess() with a dropid flag that keeps the id column
train_new_pr = feature_exclude(preprocess_categorial(preprocess(train_raw, dropid=False)))
test_new_pr = feature_exclude(preprocess_categorial(preprocess(test, dropid=False)))
# run fill_na_xgb to obtain model-based fillers for the NAs
filled_train = fill_na_xgb(train_new_pr)
filled_test = fill_na_xgb(test_new_pr)
filled_train = filled_train.set_index("id")
filled_test = filled_test.set_index("id")
In [175]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()
Out[175]:
In [176]:
train_new = preprocess_anomaly(train_raw)
test_new = preprocess_anomaly(test)
train_new = train_new.set_index("id")
test_new = test_new.set_index("id")
train_new = train_new.join(filled_train[important_feats], rsuffix="_filled")
test_new = test_new.join(filled_test[important_feats], rsuffix="_filled")
for impf in important_feats:
train_new[impf] = train_new[impf].fillna(train_new["%s_filled" % impf])
train_new = train_new.drop(["%s_filled" % impf], axis=1)
test_new[impf] = test_new[impf].fillna(test_new["%s_filled" % impf])
test_new = test_new.drop(["%s_filled" % impf], axis=1)
In [177]:
# train_new = feature_exclude(train_new)
# test_new = feature_exclude(test_new)
In [178]:
train_new.to_csv("data/train_cleaned.csv", encoding="utf_8")
test_new.to_csv("data/test_cleaned.csv", encoding="utf_8")
In [259]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()
Out[259]:
In [260]:
def update(source, patch):
    # DataFrame.update upcasts int columns to float; restore the original dtypes
    dtypes = source.dtypes
    source.update(patch, overwrite=True)
    for c, t in dtypes.iteritems():
        source[c] = source[c].astype(t)
    return source
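A toy demonstration of why the dtype restore matters (illustrative): plain DataFrame.update would leave full_sq as float64.
In [ ]:
# illustrative: update() keeps the integer dtype that DataFrame.update upcasts
src = pd.DataFrame({"full_sq": [40, 50]}, index=[1, 2])
patch = pd.DataFrame({"full_sq": [45]}, index=[2])
print update(src, patch).dtypes  # full_sq stays int64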
In [261]:
train_raw = train_raw.set_index("id")
test = test.set_index("id")
fx = pd.read_excel('data/BAD_ADDRESS_FIX.xlsx').drop_duplicates('id').set_index('id')
print "Fix in train:", train_raw.index.intersection(fx.index).shape[0]
print "Fix in test :", test.index.intersection(fx.index).shape[0]
train_raw = update(train_raw, fx)
test = update(test, fx)
train_raw = train_raw.reset_index()
test = test.reset_index()
train_raw.to_csv("data/train_fix.csv", index=False, encoding="utf-8")
test.to_csv("data/test_fix.csv", index=False, encoding="utf-8")
In [266]:
from auto_ml import Predictor
In [267]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()
Out[267]:
In [268]:
train_pr = preprocess(train_raw)
train_pr = preprocess_categorial(train_pr)
train = feature_exclude(train_pr)
In [ ]:
# Tell auto_ml which column is 'output'
# Also note columns that aren't purely numerical
# Examples include ['nlp', 'date', 'categorical', 'ignore']
column_descriptions = {
'price_doc': 'output'
}
ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)
ml_predictor.train(train)
file_name = ml_predictor.save()
print file_name
# Score the model on held-out data (df_test / MEDV are leftovers from
# auto_ml's Boston-housing example; substitute the Sberbank validation frame)
test_score = ml_predictor.score(df_test, df_test.MEDV)
In [173]:
#Checking for missing data
NAs = pd.concat([
train.isnull().sum(),
test_pr.isnull().sum()
], axis=1, keys=['Train', 'Test'])
NAs[NAs.sum(axis=1) > 0]
Out[173]: