In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline
%load_ext ipycache

import pandas as pd
pd.options.display.max_rows = 999
pd.options.display.max_columns = 300
import numpy as np
import scipy
import sklearn as sk
import xgboost as xgb

from eli5 import show_weights

import seaborn as sns
sns.set()

import matplotlib.pyplot as plt


/Users/evgeny/Library/Python/2.7/lib/python/site-packages/IPython/config.py:13: ShimWarning: The `IPython.config` package has been deprecated since IPython 4.0. You should import from traitlets.config instead.
  "You should import from traitlets.config instead.", ShimWarning)
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/IPython/utils/traitlets.py:5: UserWarning: IPython.utils.traitlets has moved to a top-level traitlets package.
  warn("IPython.utils.traitlets has moved to a top-level traitlets package.")
/Users/evgeny/Library/Python/2.7/lib/python/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
import math

#A function to calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y, y_pred):
    """Root Mean Squared Logarithmic Error.

    Vectorized with numpy (the original looped in Python and never used
    its loop variable). np.log1p(v) == log(v + 1) but is more accurate
    for small v. Inputs may be any equal-length array-likes of
    non-negative numbers.
    """
    assert len(y) == len(y_pred)
    log_diff = np.log1p(np.asarray(y_pred, dtype=float)) - np.log1p(np.asarray(y, dtype=float))
    return float(np.sqrt(np.mean(log_diff ** 2)))

def rmse(y, y_pred):
    # Root Mean Squared Error for array-like (numpy) inputs.
    err = y_pred - y
    return np.sqrt(np.mean(err ** 2))

Препроцессинг фич


In [3]:
# Load raw competition data. train_without_noise.csv is presumably a
# de-noised copy of train.csv (TODO confirm provenance); the original
# file is kept commented out for reference.
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
# Rich display of the first rows as the cell output.
train_raw.head()


Out[3]:
id timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state product_type sub_area area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion children_school school_quota school_education_centers_raion school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion additional_education_raion culture_objects_top_25 culture_objects_top_25_raion shopping_centers_raion office_raion thermal_power_plant_raion incineration_raion oil_chemistry_raion radiation_raion railroad_terminal_raion big_market_raion nuclear_reactor_raion detention_facility_raion full_all male_f female_f young_all young_male young_female work_all work_male work_female ekder_all ekder_male ekder_female 0_6_all 0_6_male 0_6_female 7_14_all 7_14_male 7_14_female 0_17_all 0_17_male 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male 0_13_female raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_foam build_count_slag build_count_mix raion_build_count_with_builddate_info build_count_before_1920 build_count_1921-1945 build_count_1946-1970 build_count_1971-1995 build_count_after_1995 ID_metro metro_min_avto metro_km_avto metro_min_walk metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km water_treatment_km cemetery_km incineration_km railroad_station_walk_km railroad_station_walk_min ID_railroad_station_walk railroad_station_avto_km railroad_station_avto_min ID_railroad_station_avto public_transport_station_km public_transport_station_min_walk water_km water_1line mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km ID_big_road1 big_road1_1line big_road2_km ID_big_road2 railroad_km railroad_1line zd_vokzaly_avto_km ID_railroad_terminal bus_terminal_avto_km ID_bus_terminal oil_chemistry_km nuclear_reactor_km 
radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km catering_km ecology green_part_500 prom_part_500 office_count_500 office_sqm_500 trc_count_500 trc_sqm_500 cafe_count_500 cafe_sum_500_min_price_avg cafe_sum_500_max_price_avg cafe_avg_price_500 cafe_count_500_na_price cafe_count_500_price_500 cafe_count_500_price_1000 cafe_count_500_price_1500 cafe_count_500_price_2500 cafe_count_500_price_4000 cafe_count_500_price_high big_church_count_500 church_count_500 mosque_count_500 leisure_count_500 sport_count_500 market_count_500 green_part_1000 prom_part_1000 office_count_1000 office_sqm_1000 trc_count_1000 trc_sqm_1000 cafe_count_1000 cafe_sum_1000_min_price_avg cafe_sum_1000_max_price_avg cafe_avg_price_1000 cafe_count_1000_na_price cafe_count_1000_price_500 cafe_count_1000_price_1000 cafe_count_1000_price_1500 cafe_count_1000_price_2500 cafe_count_1000_price_4000 cafe_count_1000_price_high big_church_count_1000 church_count_1000 mosque_count_1000 leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg 
cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_1000 cafe_count_3000_price_1500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_count_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_sum_5000_max_price_avg cafe_avg_price_5000 cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
0 1 2011-08-20 43 27.0 4.0 NaN NaN NaN NaN NaN NaN Investment Bibirevo 6.407578e+06 155572 0.189727 0.000070 9576 5001.0 5.0 10309 11065.0 5 0 240.0 1 0 7 3 no 0 16 1 no no no no no no no no 86206 40477 45729 21154 11007 10147 98207 52277 45930 36211 10580 25631 9576 4899 4677 10309 5463 4846 23603 12286 11317 17508 9425 8083 18654 9709 8945 211.0 25.0 0.0 0.0 0.0 2.0 184.0 0.0 0.0 0.0 211.0 0.0 0.0 0.0 206.0 5.0 1 2.590241 1.131260 13.575119 1.131260 0.145700 0.177975 2.158587 0.600973 1.080934 23.683460 1.804127 3.633334 5.419893 65.038716 1.0 5.419893 6.905893 1 0.274985 3.299822 0.992631 no 1.422391 10.918587 13.100618 13.675657 15.156211 1.422391 1 no 3.830951 5 1.305159 no 14.231961 101 24.292406 1 18.152338 5.718519 1.210027 1.062513 5.814135 4.308127 10.814172 1.676258 0.485841 3.065047 1.107594 8.148591 3.516513 2.392353 4.248036 0.974743 6.715026 0.884350 0.648488 0.637189 0.947962 0.177975 0.625783 0.628187 3.932040 14.053047 7.389498 7.023705 0.516838 good 0.00 0.00 0 0 0 0 0 NaN NaN NaN 0 0 0 0 0 0 0 0 0 0 0 1 0 7.36 0.00 1 30500 3 55600 19 527.78 888.89 708.33 1 10 4 3 1 0 0 1 2 0 0 6 1 14.27 6.92 3 39554 9 171420 34 566.67 969.70 768.18 1 14 11 6 2 0 0 1 2 0 0 7 1 11.77 15.97 9 188854 19 1244891 36 614.29 1042.86 828.57 1 15 11 6 2 1 0 1 2 0 0 10 1 11.98 13.55 12 251554 23 1419204 68 639.68 1079.37 859.52 5 21 22 16 3 1 0 2 4 0 0 21 1 13.09 13.31 29 807385 52 4036616 152 708.57 1185.71 947.14 12 39 48 40 9 4 0 13 22 1 0 52 4 5850000
1 2 2011-08-23 34 19.0 3.0 NaN NaN NaN NaN NaN NaN Investment Nagatinskij Zaton 9.589337e+06 115352 0.372602 0.049637 6880 3119.0 5.0 7759 6237.0 8 0 229.0 1 0 6 1 yes 1 3 0 no no no no no no no no 76284 34200 42084 15727 7925 7802 70194 35622 34572 29431 9266 20165 6880 3466 3414 7759 3909 3850 17700 8998 8702 15164 7571 7593 13729 6929 6800 245.0 83.0 1.0 0.0 67.0 4.0 90.0 0.0 0.0 0.0 244.0 1.0 1.0 143.0 84.0 15.0 2 0.936700 0.647337 7.620630 0.635053 0.147754 0.273345 0.550690 0.065321 0.966479 1.317476 4.655004 8.648587 3.411993 40.943917 2.0 3.641773 4.679745 2 0.065263 0.783160 0.698081 no 9.503405 3.103996 6.444333 8.132640 8.698054 2.887377 2 no 3.103996 4 0.694536 no 9.242586 32 5.706113 2 9.034642 3.489954 2.724295 1.246149 3.419574 0.725560 6.910568 3.424716 0.668364 2.000154 8.972823 6.127073 1.161579 2.543747 12.649879 1.477723 1.852560 0.686252 0.519311 0.688796 1.072315 0.273345 0.967821 0.471447 4.841544 6.829889 0.709260 2.358840 0.230287 excellent 25.14 0.00 0 0 0 0 5 860.00 1500.00 1180.00 0 1 3 0 0 1 0 0 1 0 0 0 0 26.66 0.07 2 86600 5 94065 13 615.38 1076.92 846.15 0 5 6 1 0 1 0 1 2 0 4 2 0 21.53 7.71 3 102910 7 127065 17 694.12 1205.88 950.00 0 6 7 1 2 1 0 1 5 0 4 9 0 22.37 19.25 4 165510 8 179065 21 695.24 1190.48 942.86 0 7 8 3 2 1 0 1 5 0 4 11 0 18.07 27.32 12 821986 14 491565 30 631.03 1086.21 858.62 1 11 11 4 2 1 0 1 7 0 6 19 1 10.26 27.47 66 2690465 40 2034942 177 673.81 1148.81 911.31 9 49 65 36 15 3 0 15 29 1 10 66 14 6000000
2 3 2011-08-27 43 29.0 2.0 NaN NaN NaN NaN NaN NaN Investment Tekstil'shhiki 4.808270e+06 101708 0.112560 0.118537 5879 1463.0 4.0 6207 5580.0 7 0 1183.0 1 0 5 1 no 0 0 1 no no no yes no no no no 101982 46076 55906 13028 6835 6193 63388 31813 31575 25292 7609 17683 5879 3095 2784 6207 3269 2938 14884 7821 7063 19401 9045 10356 11252 5916 5336 330.0 59.0 0.0 0.0 206.0 4.0 60.0 0.0 1.0 0.0 330.0 1.0 0.0 246.0 63.0 20.0 3 2.120999 1.637996 17.351515 1.445960 0.049102 0.158072 0.374848 0.453172 0.939275 4.912660 3.381083 11.996480 1.277658 15.331896 3.0 1.277658 1.701420 3 0.328756 3.945073 0.468265 no 5.604800 2.927487 6.963403 8.054252 9.067885 0.647250 3 no 2.927487 4 0.700691 no 9.540544 5 6.710302 3 5.777394 7.506612 0.772216 1.602183 3.682455 3.562188 5.752368 1.375443 0.733101 1.239304 1.978517 0.767569 1.952771 0.621357 7.682303 0.097144 0.841254 1.510089 1.486533 1.543049 0.391957 0.158072 3.178751 0.755946 7.922152 4.273200 3.156423 4.958214 0.190462 poor 1.67 0.00 0 0 0 0 3 666.67 1166.67 916.67 0 0 2 1 0 0 0 0 0 0 0 0 0 4.99 0.29 0 0 0 0 9 642.86 1142.86 892.86 2 0 5 2 0 0 0 0 1 0 0 5 3 9.92 6.73 0 0 1 2600 14 516.67 916.67 716.67 2 4 6 2 0 0 0 0 4 0 0 6 5 12.99 12.75 4 100200 7 52550 24 563.64 977.27 770.45 2 8 9 4 1 0 0 0 4 0 0 8 5 12.14 26.46 8 110856 7 52550 41 697.44 1192.31 944.87 2 9 17 9 3 1 0 0 11 0 0 20 6 13.69 21.58 43 1478160 35 1572990 122 702.68 1196.43 949.55 10 29 45 25 10 3 0 11 27 0 4 67 10 5700000
3 4 2011-09-01 89 50.0 9.0 NaN NaN NaN NaN NaN NaN Investment Mitino 1.258354e+07 178473 0.194703 0.069753 13087 6839.0 9.0 13670 17063.0 10 0 NaN 1 0 17 6 no 0 11 4 no no no no no no no no 21155 9828 11327 28563 14680 13883 120381 60040 60341 29529 9083 20446 13087 6645 6442 13670 7126 6544 32063 16513 15550 3292 1450 1842 24934 12782 12152 458.0 9.0 51.0 12.0 124.0 50.0 201.0 0.0 9.0 2.0 459.0 13.0 24.0 40.0 130.0 252.0 4 1.489049 0.984537 11.565624 0.963802 0.179441 0.236455 0.078090 0.106125 0.451173 15.623710 2.017080 14.317640 4.291432 51.497190 4.0 3.816045 5.271136 4 0.131597 1.579164 1.200336 no 2.677824 14.606501 17.457198 18.309433 19.487005 2.677824 1 no 2.780449 17 1.999265 no 17.478380 83 6.734618 1 27.667863 9.522538 6.348716 1.767612 11.178333 0.583025 27.892717 0.811275 0.623484 1.950317 6.483172 7.385521 4.923843 3.549558 8.789894 2.163735 10.903161 0.622272 0.599914 0.934273 0.892674 0.236455 1.031777 1.561505 15.300449 16.990677 16.041521 5.029696 0.465820 good 17.36 0.57 0 0 0 0 2 1000.00 1500.00 1250.00 0 0 0 2 0 0 0 0 0 0 0 0 0 19.25 10.35 1 11000 6 80780 12 658.33 1083.33 870.83 0 3 4 5 0 0 0 0 0 0 0 3 1 28.38 6.57 2 11000 7 89492 23 673.91 1130.43 902.17 0 5 9 8 1 0 0 1 0 0 0 9 2 32.29 5.73 2 11000 7 89492 25 660.00 1120.00 890.00 0 5 11 8 1 0 0 1 1 0 0 13 2 20.79 3.57 4 167000 12 205756 32 718.75 1218.75 968.75 0 5 14 10 3 0 0 1 2 0 0 18 3 14.18 3.89 8 244166 22 942180 61 931.58 1552.63 1242.11 4 7 21 15 11 2 1 4 4 0 0 26 3 13100000
4 5 2011-09-05 77 77.0 4.0 NaN NaN NaN NaN NaN NaN Investment Basmannoe 8.398461e+06 108171 0.015234 0.037316 5706 3240.0 7.0 6748 7770.0 9 0 562.0 4 2 25 2 no 0 10 93 no no no yes yes no no no 28179 13522 14657 13368 7159 6209 68043 34236 33807 26760 8563 18197 5706 2982 2724 6748 3664 3084 15237 8113 7124 5164 2583 2581 11631 6223 5408 746.0 48.0 0.0 0.0 643.0 16.0 35.0 0.0 3.0 1.0 746.0 371.0 114.0 146.0 62.0 53.0 5 1.257186 0.876620 8.266305 0.688859 0.247901 0.376838 0.258289 0.236214 0.392871 10.683540 2.936581 11.903910 0.853960 10.247521 5.0 1.595898 2.156284 113 0.071480 0.857764 0.820294 no 11.616653 1.721834 0.046810 0.787593 2.578671 1.721834 4 no 3.133531 10 0.084113 yes 1.595898 113 1.423428 4 6.515857 8.671016 1.638318 3.632640 4.587917 2.609420 9.155057 1.969738 0.220288 2.544696 3.975401 3.610754 0.307915 1.864637 3.779781 1.121703 0.991683 0.892668 0.429052 0.077901 0.810801 0.376838 0.378756 0.121681 2.584370 1.112486 1.800125 1.339652 0.026102 excellent 3.56 4.44 15 293699 1 45000 48 702.22 1166.67 934.44 3 17 10 11 7 0 0 1 4 0 2 3 0 3.34 8.29 46 420952 3 158200 153 763.45 1272.41 1017.93 8 39 45 39 19 2 1 7 12 0 6 7 0 4.12 4.83 93 1195735 9 445900 272 766.80 1272.73 1019.76 19 70 74 72 30 6 1 18 30 0 10 14 2 4.53 5.02 149 1625130 17 564843 483 765.93 1269.23 1017.58 28 130 129 131 50 14 1 35 61 0 17 21 3 5.06 8.62 305 3420907 60 2296870 1068 853.03 1410.45 1131.74 63 266 267 262 149 57 4 70 121 1 40 77 5 8.38 10.92 689 8404624 114 3503058 2283 853.88 1411.45 1132.66 143 566 578 552 319 108 17 135 236 2 91 195 14 16331452

In [4]:
def preprocess_anomaly(df):
    """Replace implausible values with NaN and fix obvious typos in place.

    Fixes vs. the original:
    - `map(lambda ...)` column assignments relied on Python 2 returning a
      list; replaced with `Series.where`, which works on Python 2 and 3.
    - `.ix` indexing is deprecated and removed in modern pandas; replaced
      with boolean-mask `.loc` assignments (same semantics).

    Mutates `df` and also returns it.
    """
    # Areas below a sanity threshold are treated as missing.
    # NaN values compare False, so existing NaNs are preserved by `where`.
    df["full_sq"] = df["full_sq"].where(df["full_sq"] > 10)
    df["life_sq"] = df["life_sq"].where(df["life_sq"] > 5)
    df["kitch_sq"] = df["kitch_sq"].where(df["kitch_sq"] > 2)

    # superclean
    # https://www.kaggle.com/keremt/very-extensive-cleaning-by-sberbank-discussions
    df.loc[df.life_sq > df.full_sq, "life_sq"] = np.NaN
    df.loc[df.kitch_sq >= df.life_sq, "kitch_sq"] = np.NaN

    df.loc[df.kitch_sq == 0, "kitch_sq"] = np.NaN
    df.loc[df.kitch_sq == 1, "kitch_sq"] = np.NaN

    df.loc[df.num_room == 0, "num_room"] = np.NaN

    df.loc[df.floor == 0, "floor"] = np.NaN
    df.loc[df.max_floor == 0, "max_floor"] = np.NaN

    # A floor above the building's top floor means max_floor is wrong.
    df.loc[df.floor > df.max_floor, "max_floor"] = np.NaN

    # `state` is a small ordinal; 33 is almost certainly a typo for 3,
    # but the original discards it, so we keep that behavior.
    df.loc[df.state == 33, "state"] = np.NaN

    # Obvious build_year typos with recoverable intent.
    df.loc[df.build_year == 20052009, "build_year"] = 2005
    df.loc[df.build_year == 20, "build_year"] = 2000
    df.loc[df.build_year == 215, "build_year"] = 2015

    # Remaining out-of-range years are unrecoverable.
    df.loc[df.build_year < 1500, "build_year"] = np.NaN
    df.loc[df.build_year > 2022, "build_year"] = np.NaN

    return df

In [5]:
def preprocess_categorial(df):
    """Label-encode every object (string) column of `df`.

    Each encoder is fit on the union of the global `train_raw` and `test`
    values for that column, so train and test share one consistent
    mapping; when the column is missing from either global frame, the
    encoder falls back to `df`'s own values. The original string columns
    are then dropped, leaving only the `<col>_le` integer copies.

    Fix: `import sklearn as sk` does not guarantee `sk.preprocessing` is
    importable (submodules are not auto-imported); use an explicit import.
    """
    from sklearn.preprocessing import LabelEncoder

    for c in list(df.columns):
        if df[c].dtype == 'object':
            lbl = LabelEncoder()
            try:
                # Fit on train + test so encodings agree across frames.
                lbl.fit(list(train_raw[c].values) + list(test[c].values))
            except KeyError:
                # Column exists only in `df` — fit on it alone.
                lbl.fit(df[c].values)
            df[c + "_le"] = lbl.transform(list(df[c].values))

#     df = mess_y_categorial(df, 5)

    # Drop original string columns, keeping the encoded versions only.
    df = df.select_dtypes(exclude=['object'])
    return df

def apply_categorial(test, train):
    """Label-encode object columns of `test`, mirroring preprocess_categorial.

    `train` is currently unused (only referenced by the commented-out
    target-encoding line) but is kept for interface compatibility.

    Fix: `import sklearn as sk` does not guarantee `sk.preprocessing` is
    importable (submodules are not auto-imported); use an explicit import.
    """
    from sklearn.preprocessing import LabelEncoder

    for c in list(test.columns):
        if test[c].dtype == 'object':
            lbl = LabelEncoder()
            try:
                # Fit on the global train + test union for a shared mapping.
                lbl.fit(list(train_raw[c].values) + list(test[c].values))
            except KeyError:
                lbl.fit(test[c].values)
            test[c + "_le"] = lbl.transform(list(test[c].values))

#     test = mess_y_categorial_fold(test, train)

    # Drop original string columns, keeping the encoded versions only.
    test = test.select_dtypes(exclude=['object'])
    return test


def smoothed_likelihood(targ_mean, nrows, globalmean, alpha=10):
    """Additively smoothed target mean: blends a category's mean with the
    global mean, weighted by the category's row count vs. `alpha`.

    Returns NaN when the inputs are unusable (e.g. None for a category
    that was never seen in the reference fold).
    """
    try:
        weighted = targ_mean * nrows + globalmean * alpha
        return weighted / (nrows + alpha)
    except Exception:
        # None/invalid inputs -> missing value rather than a crash.
        return float("NaN")


def mess_y_categorial(df, nfolds=3, alpha=10):
    """Out-of-fold target encoding: each fold's categorical columns are
    encoded with statistics computed from the other folds only, which
    avoids leaking a row's own target into its encoding.

    Fix: `alpha` is now forwarded to mess_y_categorial_fold — it was
    hard-coded to 10 in the inner call, silently ignoring the caller's
    value.
    """
    from copy import copy

    folds = np.array_split(df, nfolds)
    newfolds = []
    for i in range(nfolds):
        fold = folds[i]

        # Every fold except the current one supplies the statistics.
        other_folds = copy(folds)
        other_folds.pop(i)
        other_fold = pd.concat(other_folds)

        newfolds.append(mess_y_categorial_fold(fold, other_fold, alpha=alpha))

    return pd.concat(newfolds)

def mess_y_categorial_fold(fold_raw, other_fold, cols=None, y_col="price_doc", alpha=10):
    """Target-encode categorical columns of `fold_raw` using statistics
    from `other_fold` only (no leakage from the encoded rows themselves).

    For each column in `cols` (default: all object columns), adds a
    `<col>_sll` column holding the smoothed target likelihood of each
    category; unseen/falsy values become NaN. Returns a copy.
    """
    fold = fold_raw.copy()
    if not cols:
        cols = list(fold.select_dtypes(include=["object"]).columns)
    globalmean = other_fold[y_col].mean()
    for col in cols:
        # Per-category target mean and row count from the reference fold.
        means = other_fold[[col, y_col]].groupby(col).mean().to_dict()[y_col]
        counts = other_fold[col].value_counts().to_dict()

        def encode(value):
            if not value:
                return float("NaN")
            return smoothed_likelihood(means.get(value), counts.get(value), globalmean, alpha)

        fold[col + "_sll"] = fold[col].apply(encode)
    return fold

def feature_exclude(df):
    """Drop features that a prior greedy search flagged for removal.

    Reads greedy_search.tsv from the working directory; rows with at
    least 6 tab-separated fields whose 6th field is "remove" name a
    feature (1st field) to drop. Returns `df` unchanged if nothing
    qualifies.
    """
    to_drop = []

    with open("greedy_search.tsv") as gs:
        for line in gs:
            fields = line.strip().split("\t")
            # Short/malformed rows carry no verdict — skip them.
            if len(fields) >= 6 and fields[5] == "remove":
                to_drop.append(fields[0])
    if to_drop:
        df = df.drop(to_drop, axis=1)
    return df

In [6]:
def apply_macro(df):
    """Left-join a curated subset of macro-economic indicators onto `df`
    by timestamp (the global `macro` frame provides the data).

    Fix: `macro_cols` was computed but never used, so the merge joined
    every macro column; it now restricts the join to the curated list,
    as the variable's presence clearly intended.
    """
    macro_cols = [
        'timestamp', "balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
        "micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
        "income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"
    ]
    return pd.merge(df, macro[macro_cols], on='timestamp', how='left')

In [7]:
def preprocess(df):
    """Main feature engineering for one raw frame (train or test).

    Adds ordinal/boolean/relative features, cleans anomalies via
    preprocess_anomaly(), recasts several numeric ID/state columns as
    string categories (empty string = missing), and drops id/timestamp.
    Mutates `df` and also returns it.
    """
    # NOTE(review): both imported names are unused in this function.
    from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
    
#     df = apply_macro(df)

    # Ordinal ranking of the ecology rating: list position = quality.
    # NOTE(review): `map` returns a lazy iterator on Python 3; these
    # column assignments rely on Python 2 list semantics.
    ecology = ["no data", "poor", "satisfactory", "good", "excellent"]
    df["ecology_index"] = map(ecology.index, df["ecology"].values)

    # Sale year (leading "YYYY" of the timestamp string) minus build year;
    # a negative age means the building was unfinished at sale time.
    df["age_of_building"] = df["timestamp"].apply(lambda x: x.split("-")[0]).astype(int) - df["build_year"]
    df["is_build_in_progress"] = df["age_of_building"].apply(lambda x: "yes" if x < 0 else "no")

    # District-level "yes"/"no" flags converted to boolean columns.
    bool_feats = [
        "thermal_power_plant_raion",
        "incineration_raion",
        "oil_chemistry_raion",
        "radiation_raion",
        "railroad_terminal_raion",
        "big_market_raion",
        "nuclear_reactor_raion",
        "detention_facility_raion",
        "water_1line",
        "big_road1_1line",
        "railroad_1line",
        "culture_objects_top_25"
    ]
    for bf in bool_feats:
        df[bf + "_bool"] = map(lambda x: x == "yes", df[bf].values)

    # NaN-out implausible values and fix build_year typos.
    df = preprocess_anomaly(df)

    # Relative features: position within the building and area shares.
    df['rel_floor'] = df['floor'] / df['max_floor'].astype(float)
    df['rel_kitch_sq'] = df['kitch_sq'] / df['full_sq'].astype(float)
    df['rel_life_sq'] = df['life_sq'] / df['full_sq'].astype(float)

    # Small numeric codes recast as string categories; the fillna
    # sentinel 0 is then replaced with "" so missing stays distinct.
    df["material_cat"] = df.material.fillna(0).astype(int).astype(str).replace("0", "")
    df["state_cat"] = df.state.fillna(0).astype(int).astype(str).replace("0", "")
    df["num_room_cat"] = df.num_room.fillna(0).astype(int).astype(str).replace("0", "")
    df["build_year_cat"] = df.build_year.fillna(0).astype(int).astype(str).replace("0", "")

    # ID columns likewise become string categories, with sentinel -10
    # mapped back to "" (missing). These are re-encoded downstream by
    # preprocess_categorial / apply_categorial.
    df["ID_metro"] = df.ID_metro.fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_railroad_station_walk"] = df.ID_railroad_station_walk.replace("", "-10").fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_railroad_station_avto"] = df.ID_railroad_station_avto.fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_big_road1"] = df.ID_big_road1.fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_big_road2"] = df.ID_big_road2.fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_bus_terminal"] = df.ID_bus_terminal.fillna(-10).astype(int).astype(str).replace("-10", "")

#    # ratio of living area to full area #
#     df["ratio_life_sq_full_sq"] = df["life_sq"] / np.maximum(df["full_sq"].astype("float"),1)
#     df["ratio_life_sq_full_sq"].ix[df["ratio_life_sq_full_sq"]<0] = 0
#     df["ratio_life_sq_full_sq"].ix[df["ratio_life_sq_full_sq"]>1] = 1

#     # ratio of kitchen area to living area #
#     df["ratio_kitch_sq_life_sq"] = df["kitch_sq"] / np.maximum(df["life_sq"].astype("float"),1)
#     df["ratio_kitch_sq_life_sq"].ix[df["ratio_kitch_sq_life_sq"]<0] = 0
#     df["ratio_kitch_sq_life_sq"].ix[df["ratio_kitch_sq_life_sq"]>1] = 1

#     # ratio of kitchen area to full area #
#     df["ratio_kitch_sq_full_sq"] = df["kitch_sq"] / np.maximum(df["full_sq"].astype("float"),1)
#     df["ratio_kitch_sq_full_sq"].ix[df["ratio_kitch_sq_full_sq"]<0] = 0
#     df["ratio_kitch_sq_full_sq"].ix[df["ratio_kitch_sq_full_sq"]>1] = 1

    # id is an index, timestamp was consumed above — neither is a feature.
    df = df.drop(["id", "timestamp"], axis=1)

    return df

In [8]:
# Run the full preprocessing pipeline on the training frame.
train_pr = preprocess(train_raw)
train_pr = preprocess_categorial(train_pr)
train = feature_exclude(train_pr)
# Target: price per square meter (the model predicts price_meter,
# not the raw price_doc).
train["price_meter"] = train["price_doc"] / train["full_sq"]
# train = train.fillna(-1)

# Both the absolute price and the derived target are excluded from X.
X = train.drop(["price_doc", "price_meter"], axis=1)
y = train["price_meter"].values

Обучение моделей


In [10]:
from sklearn.model_selection import train_test_split

# 80/20 holdout for early stopping; fixed random_state for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X.values, y, test_size=0.20, random_state=43)

# DMatrix over the full set (for xgb.cv below) and over the split
# (for early-stopped training); feature names kept for importances.
dtrain_all = xgb.DMatrix(X.values, y, feature_names=X.columns)
dtrain = xgb.DMatrix(X_train, y_train, feature_names=X.columns)
dval = xgb.DMatrix(X_val, y_val, feature_names=X.columns)

In [11]:
xgb_params = {
    'max_depth': 5,
    # NOTE(review): 'n_estimators' is an sklearn-wrapper parameter; the
    # native xgb.train API ignores it (rounds come from num_boost_round).
    'n_estimators': 200,
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Uncomment to tune XGB `num_boost_rounds`
# Train with early stopping on the holdout to find the round count.
model = xgb.train(xgb_params, dtrain, num_boost_round=4000, evals=[(dval, 'val')],
                  early_stopping_rounds=40, verbose_eval=40)

# Best round found by early stopping, reused by xgb.cv below.
# NOTE(review): best_iteration is 0-based; some workflows use
# best_iteration + 1 as the round count — confirm intent.
num_boost_round = model.best_iteration


[0]	val-rmse:8.28101e+06
Will train until val-rmse hasn't improved in 40 rounds.
[40]	val-rmse:5.9641e+06
[80]	val-rmse:4.52109e+06
[120]	val-rmse:3.67296e+06
[160]	val-rmse:3.19137e+06
[200]	val-rmse:2.93826e+06
[240]	val-rmse:2.80753e+06
[280]	val-rmse:2.73126e+06
[320]	val-rmse:2.69304e+06
[360]	val-rmse:2.66852e+06
[400]	val-rmse:2.65234e+06
[440]	val-rmse:2.64059e+06
[480]	val-rmse:2.62986e+06
[520]	val-rmse:2.62135e+06
[560]	val-rmse:2.615e+06
[600]	val-rmse:2.61025e+06
[640]	val-rmse:2.60486e+06
[680]	val-rmse:2.6013e+06
[720]	val-rmse:2.5986e+06
[760]	val-rmse:2.59474e+06
[800]	val-rmse:2.59211e+06
[840]	val-rmse:2.58855e+06
[880]	val-rmse:2.58689e+06
[920]	val-rmse:2.58484e+06
[960]	val-rmse:2.5832e+06
[1000]	val-rmse:2.58156e+06
[1040]	val-rmse:2.58041e+06
[1080]	val-rmse:2.57886e+06
[1120]	val-rmse:2.57719e+06
[1160]	val-rmse:2.57494e+06
[1200]	val-rmse:2.57372e+06
[1240]	val-rmse:2.5726e+06
[1280]	val-rmse:2.57168e+06
[1320]	val-rmse:2.57047e+06
[1360]	val-rmse:2.57015e+06
[1400]	val-rmse:2.56956e+06
[1440]	val-rmse:2.5693e+06
[1480]	val-rmse:2.56842e+06
[1520]	val-rmse:2.56798e+06
[1560]	val-rmse:2.56696e+06
[1600]	val-rmse:2.56633e+06
[1640]	val-rmse:2.56568e+06
[1680]	val-rmse:2.56514e+06
[1720]	val-rmse:2.56488e+06
[1760]	val-rmse:2.56442e+06
[1800]	val-rmse:2.56438e+06
[1840]	val-rmse:2.56362e+06
[1880]	val-rmse:2.56228e+06
[1920]	val-rmse:2.56186e+06
[1960]	val-rmse:2.56099e+06
[2000]	val-rmse:2.56053e+06
[2040]	val-rmse:2.56019e+06
[2080]	val-rmse:2.55984e+06
[2120]	val-rmse:2.55942e+06
[2160]	val-rmse:2.55912e+06
[2200]	val-rmse:2.55899e+06
[2240]	val-rmse:2.55863e+06
[2280]	val-rmse:2.55832e+06
Stopping. Best iteration:
[2266]	val-rmse:2.55793e+06


In [12]:
# Booster split counts per feature, shown as a sorted importance table.
fscore_items = list(model.get_fscore().items())
imp_features = pd.DataFrame(
    fscore_items,
    columns=['feature', 'importance']
).sort_values('importance', ascending=False)
imp_features


Out[12]:
feature importance
88 full_sq 7121
161 life_sq 2326
201 floor 2085
17 max_floor 1354
158 build_year 1343
31 rel_life_sq 1186
9 rel_kitch_sq 1064
202 rel_floor 879
76 kitch_sq 732
89 state 687
102 age_of_building 672
167 kindergarten_km 579
135 metro_min_avto 552
139 railroad_km 546
75 industrial_km 505
35 material 495
49 public_transport_station_min_walk 485
68 area_m 478
5 radiation_km 470
94 mosque_km 453
53 public_healthcare_km 442
128 green_zone_km 441
197 swim_pool_km 428
173 num_room 418
231 prom_part_3000 415
119 university_km 391
30 water_km 388
64 big_church_km 385
204 school_km 367
159 hospital_beds_raion 350
65 workplaces_km 349
179 fitness_km 329
38 railroad_station_walk_min 327
117 prom_part_5000 326
200 green_part_1000 326
149 green_part_1500 323
74 cafe_sum_5000_min_price_avg 322
175 cemetery_km 322
221 church_synagogue_km 321
26 zd_vokzaly_avto_km 314
112 office_km 314
86 railroad_station_avto_km 312
61 market_shop_km 308
156 power_transmission_line_km 303
25 catering_km 300
190 metro_km_walk 299
227 ttk_km 297
40 hospice_morgue_km 297
55 ice_rink_km 291
165 additional_education_km 288
137 detention_facility_km 288
208 ts_km 286
205 big_market_km 278
170 big_road1_km 275
130 green_part_500 263
176 green_zone_part 262
20 market_count_1000 259
182 big_road2_km 259
184 theater_km 251
220 cafe_sum_1000_min_price_avg 241
216 prom_part_1500 240
129 indust_part 240
168 oil_chemistry_km 234
141 cafe_count_2000 232
81 cafe_sum_500_min_price_avg 232
209 ID_railroad_station_walk_le 231
78 green_part_5000 230
77 mkad_km 225
37 park_km 224
143 nuclear_reactor_km 224
41 metro_km_avto 220
58 basketball_km 216
142 incineration_km 211
162 stadium_km 207
146 sadovoe_km 198
71 cafe_count_500 197
80 shopping_centers_km 197
218 preschool_km 195
6 trc_sqm_2000 190
169 thermal_power_plant_km 187
133 office_sqm_5000 187
8 trc_sqm_3000 186
34 museum_km 177
132 office_sqm_2000 176
0 office_sqm_1500 174
138 prom_part_1000 173
224 office_sqm_3000 172
7 cafe_count_5000_price_2500 169
44 preschool_quota 164
12 cafe_count_1000 162
228 product_type_le 161
52 school_quota 155
155 trc_sqm_5000 150
206 green_part_3000 147
154 green_part_2000 146
178 trc_sqm_1000 143
144 cafe_count_3000 142
103 build_count_monolith 142
240 build_count_1971-1995 142
212 cafe_sum_2000_min_price_avg 134
29 trc_sqm_1500 134
100 prom_part_500 130
194 office_count_1000 128
140 full_all 127
84 sport_count_3000 126
213 ID_metro_le 124
3 sport_count_5000 123
70 big_church_count_3000 122
234 cafe_count_5000_price_high 117
151 cafe_sum_1500_min_price_avg 117
59 cafe_count_5000 115
19 church_count_1000 115
56 prom_part_2000 113
172 cafe_count_2000_price_2500 111
47 railroad_station_avto_min 109
153 cafe_sum_3000_min_price_avg 108
39 sport_count_1500 105
66 exhibition_km 104
92 cafe_count_1000_price_high 103
90 cafe_count_1500_price_500 100
108 sport_objects_raion 100
203 cafe_count_2000_price_1000 99
229 church_count_5000 94
60 sub_area_le 94
232 cafe_count_1000_price_2500 94
33 cafe_sum_500_max_price_avg 92
87 office_sqm_1000 92
121 raion_popul 88
27 office_count_500 88
11 build_count_brick 86
14 build_count_block 85
123 trc_count_2000 85
215 cafe_avg_price_500 84
24 market_count_3000 82
171 ekder_male 81
239 sport_count_1000 81
192 cafe_count_5000_price_4000 81
23 trc_count_5000 80
219 cafe_count_1500 79
85 cafe_count_3000_price_2500 78
225 cafe_count_3000_price_500 78
166 bulvar_ring_km 77
109 preschool_education_centers_raion 77
237 trc_count_1500 75
193 culture_objects_top_25_raion 71
4 cafe_count_500_price_1000 70
2 trc_count_1000 70
28 cafe_sum_1000_max_price_avg 68
136 cafe_count_500_na_price 67
63 cafe_count_1500_price_1500 67
15 ID_big_road2_le 67
187 cafe_count_1000_na_price 63
163 build_count_panel 63
67 office_count_2000 63
69 cafe_count_1000_price_1500 62
98 church_count_2000 60
183 cafe_count_3000_price_4000 60
45 trc_count_3000 59
104 market_count_1500 59
116 cafe_count_2000_price_500 57
101 cafe_count_1500_price_2500 57
105 cafe_avg_price_2000 56
217 cafe_count_5000_na_price 56
36 16_29_all 56
127 cafe_count_1500_price_4000 55
191 cafe_sum_3000_max_price_avg 54
113 office_count_1500 53
199 big_church_count_1500 53
150 build_count_before_1920 53
145 sport_count_2000 52
236 cafe_sum_2000_max_price_avg 52
122 cafe_count_5000_price_1000 52
185 sport_count_500 51
50 cafe_count_1000_price_1000 51
13 big_church_count_2000 48
111 cafe_count_5000_price_1500 45
1 cafe_count_3000_na_price 45
96 cafe_count_1500_price_1000 45
189 raion_build_count_with_material_info 44
147 cafe_count_3000_price_high 44
57 trc_sqm_500 43
177 school_education_centers_top_20_raion 42
181 office_count_3000 42
214 ID_big_road1_le 41
233 leisure_count_3000 41
73 cafe_sum_1500_max_price_avg 39
48 female_f 38
10 work_all 36
238 thermal_power_plant_raion_le 35
32 cafe_count_2000_price_1500 34
93 shopping_centers_raion 33
126 cafe_count_1000_price_500 31
196 cafe_count_1500_na_price 31
46 cafe_count_2000_price_4000 31
164 ecology_le 30
97 cafe_avg_price_1500 30
188 leisure_count_5000 28
22 cafe_count_2000_na_price 28
120 children_preschool 27
42 kremlin_km 27
99 church_count_3000 26
207 build_count_1921-1945 26
186 build_count_slag 26
195 big_church_count_1000 26
235 cafe_count_5000_price_500 25
114 office_raion 24
174 cafe_count_2000_price_high 24
230 cafe_count_500_price_500 24
125 healthcare_centers_raion 24
134 build_count_after_1995 24
131 ekder_all 21
83 mosque_count_5000 21
95 leisure_count_1000 20
152 big_church_count_5000 19
110 young_all 18
16 cafe_count_1500_price_high 17
198 trc_count_500 16
62 cafe_avg_price_1000 16
106 market_count_2000 14
43 university_top_20_raion 14
118 leisure_count_2000 13
160 ekder_female 12
124 num_room_cat_le 11
180 16_29_female 11
115 ID_railroad_terminal 11
82 ID_bus_terminal_le 10
223 male_f 9
222 railroad_terminal_raion_le 8
226 build_count_frame 8
51 leisure_count_500 8
21 cafe_avg_price_3000 7
211 build_count_wood 6
157 16_29_male 6
79 7_14_all 6
210 cafe_count_500_price_high 5
72 church_count_500 5
148 leisure_count_1500 4
91 7_14_female 4
54 big_church_count_500 4
107 build_count_mix 2
18 7_14_male 1

In [13]:
# Cross-validate on the full training set with the tuned round count
# (silent=0 enables per-round logging), then plot train vs. test RMSE
# curves to inspect the generalization gap.
cv_output = xgb.cv(dict(xgb_params, silent=0), dtrain_all, num_boost_round=num_boost_round, verbose_eval=40)
cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()


[0]	train-rmse:8.41649e+06+41741.5	test-rmse:8.41749e+06+84540.3
[40]	train-rmse:6.03322e+06+28199.8	test-rmse:6.0902e+06+71143.3
[80]	train-rmse:4.52591e+06+24294.8	test-rmse:4.65295e+06+63603
[120]	train-rmse:3.59694e+06+24673.6	test-rmse:3.799e+06+61408.8
[160]	train-rmse:3.03773e+06+23604.7	test-rmse:3.31908e+06+62720.6
[200]	train-rmse:2.70512e+06+20565.3	test-rmse:3.05728e+06+62195
[240]	train-rmse:2.50918e+06+21138.6	test-rmse:2.91332e+06+61563.8
[280]	train-rmse:2.3916e+06+20671.9	test-rmse:2.83512e+06+61156.3
[320]	train-rmse:2.3135e+06+20144.3	test-rmse:2.79042e+06+59205.5
[360]	train-rmse:2.25861e+06+19201.1	test-rmse:2.7633e+06+58681.9
[400]	train-rmse:2.21503e+06+19435.8	test-rmse:2.74456e+06+57977.8
[440]	train-rmse:2.18046e+06+19601.1	test-rmse:2.73086e+06+57811.9
[480]	train-rmse:2.15196e+06+19933	test-rmse:2.72007e+06+58000.3
[520]	train-rmse:2.12786e+06+19380.8	test-rmse:2.71168e+06+57381.1
[560]	train-rmse:2.10665e+06+19583.9	test-rmse:2.70399e+06+57641.7
[600]	train-rmse:2.08826e+06+19258.9	test-rmse:2.69753e+06+58358.9
[640]	train-rmse:2.07184e+06+19879.9	test-rmse:2.69279e+06+58757.2
[680]	train-rmse:2.05778e+06+20663.5	test-rmse:2.68847e+06+59238.1
[720]	train-rmse:2.04465e+06+21183.1	test-rmse:2.68437e+06+59468.3
[760]	train-rmse:2.03197e+06+22094.4	test-rmse:2.68174e+06+59738.9
[800]	train-rmse:2.01994e+06+22527.7	test-rmse:2.6796e+06+60355
[840]	train-rmse:2.00752e+06+23233.5	test-rmse:2.6776e+06+61028.3
[880]	train-rmse:1.99688e+06+23938	test-rmse:2.67548e+06+61020.5
[920]	train-rmse:1.98564e+06+24815.2	test-rmse:2.67365e+06+61223.1
[960]	train-rmse:1.97503e+06+25357.7	test-rmse:2.67183e+06+61828.3
[1000]	train-rmse:1.96528e+06+25780.7	test-rmse:2.66991e+06+61944.8
[1040]	train-rmse:1.95524e+06+26162.3	test-rmse:2.66828e+06+61943.5
[1080]	train-rmse:1.94577e+06+26766.3	test-rmse:2.66691e+06+62111.4
[1120]	train-rmse:1.93633e+06+27168.4	test-rmse:2.66531e+06+62610.9
[1160]	train-rmse:1.92735e+06+27697.8	test-rmse:2.66396e+06+62549.7
[1200]	train-rmse:1.91923e+06+28150.4	test-rmse:2.66314e+06+62591.8
[1240]	train-rmse:1.91119e+06+27365.7	test-rmse:2.66202e+06+62661.6
[1280]	train-rmse:1.90317e+06+27745.6	test-rmse:2.66117e+06+62582.7
[1320]	train-rmse:1.89484e+06+28139.7	test-rmse:2.66045e+06+62849.1
[1360]	train-rmse:1.88751e+06+28285.6	test-rmse:2.65967e+06+62919.8
[1400]	train-rmse:1.87949e+06+28329.9	test-rmse:2.65891e+06+63000.5
[1440]	train-rmse:1.87197e+06+28878.6	test-rmse:2.65814e+06+63109.3
[1480]	train-rmse:1.86484e+06+29134.8	test-rmse:2.65735e+06+63265.3
[1520]	train-rmse:1.85755e+06+29508.2	test-rmse:2.65652e+06+63467.6
[1560]	train-rmse:1.85066e+06+29164.2	test-rmse:2.65604e+06+63503.9
[1600]	train-rmse:1.84309e+06+29768.5	test-rmse:2.65539e+06+63461
[1640]	train-rmse:1.83616e+06+30119.4	test-rmse:2.65523e+06+63711.9
[1680]	train-rmse:1.82954e+06+29955.3	test-rmse:2.65466e+06+63606.9
[1720]	train-rmse:1.82287e+06+29298.2	test-rmse:2.65411e+06+63726.8
[1760]	train-rmse:1.81641e+06+29519.6	test-rmse:2.65359e+06+63735.6
[1800]	train-rmse:1.80996e+06+30001.5	test-rmse:2.65317e+06+63935.4
[1840]	train-rmse:1.80373e+06+30423.3	test-rmse:2.65293e+06+63938.3
[1880]	train-rmse:1.79744e+06+30126.6	test-rmse:2.65244e+06+63925.3
[1920]	train-rmse:1.79116e+06+29301.7	test-rmse:2.65217e+06+64012.4
[1960]	train-rmse:1.78525e+06+29547.1	test-rmse:2.65183e+06+63809.9
[2000]	train-rmse:1.77881e+06+28807.9	test-rmse:2.65152e+06+63886.1
[2040]	train-rmse:1.77238e+06+29150	test-rmse:2.65144e+06+63912.2
[2080]	train-rmse:1.76645e+06+29024.9	test-rmse:2.65132e+06+64081.5
[2120]	train-rmse:1.76011e+06+29154.1	test-rmse:2.65117e+06+64255.5
[2160]	train-rmse:1.75358e+06+29313.6	test-rmse:2.65090e+06+64653
[2200]	train-rmse:1.74676e+06+28968.2	test-rmse:2.65044e+06+64838.8
[2240]	train-rmse:1.74066e+06+28727.3	test-rmse:2.65025e+06+64969.5
Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x1120a7410>

In [14]:
# Fit the final model on all training data with the same params used in CV,
# then report in-sample RMSE (optimistic; compare against the CV test-rmse above).
model = xgb.train(dict(xgb_params, silent=0), dtrain_all, num_boost_round=num_boost_round, verbose_eval=40)
# NOTE: arguments are passed as rmse(y_pred, y_true) while rmse() is declared
# rmse(y, y_pred) -- harmless, since RMSE is symmetric in its arguments.
print "predict-train:", rmse(model.predict(dtrain_all), y)


predict-train: 1881896.66663

Submission


In [15]:
# Build the submission: run the test set through the same preprocessing as
# train, encode categoricals with the train-derived mapping, and drop the
# excluded features so columns match what the model was trained on.
test_pr = preprocess(test)
train_pr = preprocess(train_raw)
test_pr = apply_categorial(test_pr, train_pr)
test_pr = feature_exclude(test_pr)
# test_pr = test_pr.fillna(-1)

dtest = xgb.DMatrix(test_pr.values, feature_names=test_pr.columns)
y_pred = model.predict(dtest)

# y_pred = model.predict(test_pr.values)

# y_pred = np.exp(y_pred) - 1  # only needed when the model was trained on log(price)

submdf = pd.DataFrame({"id": test["id"], "price_doc": y_pred})
submdf.to_csv("data/submission.csv", header=True, index=False)
# Quick sanity check of the written file.
!head data/submission.csv


id,price_doc
30474,5568339.0
30475,8274571.5
30476,5637723.5
30477,5850102.5
30478,5368623.0
30479,8247314.5
30480,4413024.0
30481,3841126.5
30482,4725561.5

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude:

val-rmse:2.55793e+06
train-rmse:1.74066e+06+28727.3  test-rmse:2.65025e+06+64969.5
predict-train: 1881896.66663
kaggle: 0.31344

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:

val-rmse:2.54654e+06
train-rmse:1.74594e+06+24020    test-rmse:2.66053e+06+67300.3
predict-train: 1883352.60935
kaggle: 0.31364

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:

val-rmse:2.55613e+06
train-rmse:1.74466e+06+27385.6  test-rmse:2.66422e+06+69734.1
predict-train: 1888051.35357
kaggle: 0.31366


5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro with other ID, ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:

val-rmse:2.58557e+06
train-rmse:1.98509e+06+26803.7  test-rmse:2.68755e+06+59691.1
predict-train: 2092731.29028
kaggle: 0.31731

#

5*200, no macro, add rel features, no log price, train_without_noise:

val-rmse:2.63772e+06
train-rmse:1.9989e+06+10986.4   test-rmse:2.69158e+06+53020
predict-train: 2076010.27131
kaggle: 0.31720

5*200, no macro, add rel features, no log price, train_with_noise:

val-rmse:2.53378e+06
train-rmse:1.95069e+06+16166.4  test-rmse:2.69703e+06+61455.1
predict-train: 2054421.59869
kaggle: 0.32056

5*200, macro, add rel features, no log price, train_without_noise:

val-rmse:2.79632e+06
train-rmse:1.81015e+06+19781.2  test-rmse:2.6641e+06+123875
predict-train: 1904063.27368
kaggle: 0.32976

5*200, no macro, add rel features, no log price, train_without_noise:

val-rmse:2.61682e+06
train-rmse:1.81123e+06+27681.2  test-rmse:2.66923e+06+53925.7
predict-train: 1899129.43771
kaggle: 0.31592

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter:

val-rmse:2.61055e+06
train-rmse:1.71826e+06+30076.1  test-rmse:2.66515e+06+54583.5
predict-train: 1814572.97424
kaggle: 0.31602

7*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:

val-rmse:2.59955e+06
train-rmse:1.41393e+06+21208.1  test-rmse:2.6763e+06+35553.3
predict-train: 1548257.49121
kaggle: 0.31768

4*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:

val-rmse:2.63407e+06
train-rmse:1.96513e+06+21470.8  test-rmse:2.69417e+06+74288.3
predict-train: 2062299.41091
kaggle: 0.31952

7*200, no macro, add rel features, no log price, train_without_noise, 4000 iter:

val-rmse:2.59955e+06
train-rmse:1.41393e+06+21208.1  test-rmse:2.6763e+06+35553.3
predict-train: 1548257.49121

5*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:

val-rmse:2.61055e+06
train-rmse:1.71826e+06+30076.1  test-rmse:2.66515e+06+54583.5
predict-train: 1814572.97424

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna:

val-rmse:2.61664e+06
train-rmse:1.77892e+06+23111    test-rmse:2.65829e+06+56398.6
predict-train: 1875799.54634
kaggle: 0.31521

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean:

val-rmse:2.6265e+06
train-rmse:1.78478e+06+22545.4  test-rmse:2.66179e+06+60626.3
predict-train: 1881672.27588
kaggle: 0.31476

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, no super features + Label Encoding:

val-rmse:2.56494e+06
train-rmse:1.78862e+06+18589.1  test-rmse:2.69283e+06+79861.4
predict-train: 1923466.41923
kaggle: 0.31434

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, remove material state num_room:

val-rmse:2.56932e+06
train-rmse:1.88495e+06+20133.7  test-rmse:2.69624e+06+70491.2
predict-train: 1979198.19201
kaggle: 0.31513

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro/bus...:

val-rmse:2.60017e+06
train-rmse:1.80654e+06+19453.5  test-rmse:2.68203e+06+68169.5
predict-train: 1906439.98603
kaggle: 0.31927

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, remove 50 features:

val-rmse:2.93665e+06
train-rmse:1.73425e+06+19462.4  test-rmse:2.68682e+06+140661
predict-train: 1861268.6455
kaggle: 0.31555

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, remove 50 features, add ratio feats:

val-rmse:2.59747e+06
train-rmse:1.75828e+06+26639.4  test-rmse:2.68491e+06+67201.8
predict-train: 1875707.6581
kaggle: 0.31760

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, superfeatures + Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle:

val-rmse:2.5419e+06
train-rmse:1.74381e+06+22710.7  test-rmse:2.65787e+06+66889.9
predict-train: 1862467.67153
kaggle: 0.31716

5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle:

val-rmse:2.5676e+06
train-rmse:1.81485e+06+24274    test-rmse:2.67324e+06+60153.1
predict-train: 1947645.83102
kaggle: 0.31376

Feature Greedy selection


In [45]:
from tqdm import tqdm
def get_best_score(train, num_boost_round=4000, early_stopping_rounds=40):
    """Cross-validate an xgboost model on `train` and return its best CV score.

    Parameters
    ----------
    train : xgb.DMatrix
        Training matrix with labels (see df2DMatrix).
    num_boost_round : int, optional
        Maximum boosting rounds (default 4000, matching the manual runs above).
    early_stopping_rounds : int, optional
        Stop CV if test-rmse has not improved for this many rounds (default 40).

    Returns
    -------
    (float, int)
        Best mean test RMSE across folds and the boosting round it occurred at.
    """
    xgb_params = {
        'max_depth': 5,
        # 'n_estimators' is a sklearn-API parameter and is ignored by xgb.cv;
        # kept for consistency with the param dicts used in other cells.
        'n_estimators': 200,
        'learning_rate': 0.01,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1
    }
    cvres = xgb.cv(xgb_params, train,
                   num_boost_round=num_boost_round,
                   early_stopping_rounds=early_stopping_rounds)
    return cvres["test-rmse-mean"].min(), cvres["test-rmse-mean"].argmin()

def df2DMatrix(df):
    """Convert a DataFrame containing a `price_doc` column into an xgb.DMatrix.

    `price_doc` becomes the label; every remaining column becomes a feature.
    """
    labels = df["price_doc"].values
    features = df.drop("price_doc", axis=1).values
    return xgb.DMatrix(data=features, label=labels)

def greedy_remove_features(df, feature_importances):
    """Greedy backward feature elimination driven by CV score.

    Iterates over features in ascending importance order; for each feature,
    re-runs CV without it and permanently drops it when the CV score does
    not get worse.  Every decision is appended to greedy_search.tsv so an
    interrupted run can be resumed by replaying the log.
    """
    train = df
    with open("greedy_search.tsv", "a") as f:
        best_score, iterno = get_best_score(df2DMatrix(df))
        f.write("\t".join(["INITIAL", str(best_score), str(iterno)]) + "\n")
        # Least-important features first: cheapest candidates for removal.
        to_analyze = sorted(feature_importances.items(), key=lambda x: x[1])
        for feat, feat_importance in tqdm(to_analyze):
            f.flush()
            candidate_train = train.drop(feat, axis=1)
            cand_best_score, iterno = get_best_score(df2DMatrix(candidate_train))

            if cand_best_score > best_score:
                # Score got worse -- keep the feature.
                f.write("\t".join([feat, str(cand_best_score), str(best_score), str(feat_importance), str(iterno), "skip"]) + "\n")
                f.flush()
                continue

            # Equal-or-better score without the feature -- drop it for good.
            f.write("\t".join([feat, str(cand_best_score), str(best_score), str(feat_importance), str(iterno), "remove"]) + "\n")
            best_score = cand_best_score
            train = candidate_train

In [47]:
# Resume a previously interrupted greedy search: replay greedy_search.tsv,
# re-dropping every feature already marked "remove" and removing all
# already-evaluated features from the candidate dict before continuing.
feature_importances = imp_features.set_index("feature").to_dict()["importance"]

train_gs = train
with open("greedy_search.tsv") as gs:
    for line in gs:
        row = line.strip().split("\t")
        if len(row) < 6:
            # INITIAL lines have fewer columns -- nothing to replay.
            continue
        if row[5] == "remove":
            try:
                train_gs = train_gs.drop(row[0], axis=1)
            except ValueError:
                # Column already absent -- presumably older pandas raises
                # ValueError for missing labels here; verify on upgrade.
                pass
            print "drop", row[0]
        # Both "skip" and "remove" rows were already evaluated; never re-test them.
        feature_importances.pop(row[0], None)

greedy_remove_features(train_gs, feature_importances)


drop 0_6_female
drop young_female
drop market_count_500
drop cafe_count_500_price_4000
drop nuclear_reactor_raion_bool
drop work_male
drop radiation_raion_bool
drop 0_13_female
drop detention_facility_raion_bool
drop thermal_power_plant_raion_bool
drop work_female
drop ecology_index
drop 0_17_male
drop railroad_terminal_raion_bool
drop church_count_1500
drop big_road1_1line_bool
drop additional_education_raion
drop cafe_count_1000_price_4000
drop cafe_count_3000_price_1500
drop office_count_5000
drop children_school
drop cafe_avg_price_5000
drop build_count_1946-1970
drop school_education_centers_raion
drop build_count_foam
drop market_count_5000
drop cafe_count_3000_price_1000
drop cafe_sum_5000_max_price_avg
drop cafe_count_500_price_2500
drop cafe_count_500_price_1500
100%|██████████| 123/123 [20:40:45<00:00, 613.14s/it]  

In [ ]: