In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
%matplotlib inline
%load_ext ipycache
import pandas as pd
pd.options.display.max_rows = 999
pd.options.display.max_columns = 300
import numpy as np
import scipy
import sklearn as sk
import xgboost as xgb
from eli5 import show_weights
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
In [2]:
import math

# Root Mean Squared Logarithmic Error (RMSLE), the competition metric
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [
        (math.log(pred + 1) - math.log(actual + 1)) ** 2.0
        for actual, pred in zip(y, y_pred)
    ]
    return (sum(terms_to_sum) / len(y)) ** 0.5

def rmse(y, y_pred):
    return np.sqrt(((y_pred - y) ** 2).mean())
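The loop above mirrors the metric definition term by term; for larger arrays the same RMSLE can be computed in vectorized form with np.log1p. This is an equivalent sketch for reference, not used elsewhere in the notebook:
# Vectorized equivalent of rmsle() above: sqrt(mean((log1p(pred) - log1p(y))^2))
def rmsle_vec(y, y_pred):
    y, y_pred = np.asarray(y, dtype=float), np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y)) ** 2))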
In [3]:
# train_raw = pd.read_csv("data/train.csv")
train_raw = pd.read_csv("data/train_without_noise.csv")
test = pd.read_csv("data/test.csv")
macro = pd.read_csv("data/macro.csv")
train_raw.head()
Out[3]:
In [4]:
def preprocess_anomaly(df):
    # Implausibly small areas are treated as missing
    df["full_sq"] = df["full_sq"].where(df["full_sq"] > 10)
    df["life_sq"] = df["life_sq"].where(df["life_sq"] > 5)
    df["kitch_sq"] = df["kitch_sq"].where(df["kitch_sq"] > 2)
    # superclean
    # https://www.kaggle.com/keremt/very-extensive-cleaning-by-sberbank-discussions
    df.loc[df.life_sq > df.full_sq, "life_sq"] = np.nan
    df.loc[df.kitch_sq >= df.life_sq, "kitch_sq"] = np.nan
    df.loc[df.kitch_sq == 0, "kitch_sq"] = np.nan
    df.loc[df.kitch_sq == 1, "kitch_sq"] = np.nan
    df.loc[df.num_room == 0, "num_room"] = np.nan
    df.loc[df.floor == 0, "floor"] = np.nan
    df.loc[df.max_floor == 0, "max_floor"] = np.nan
    df.loc[df.floor > df.max_floor, "max_floor"] = np.nan
    df.loc[df.state == 33, "state"] = np.nan
    # Obvious build_year typos reported in the competition discussions
    df.loc[df.build_year == 20052009, "build_year"] = 2005
    df.loc[df.build_year == 20, "build_year"] = 2000
    df.loc[df.build_year == 215, "build_year"] = 2015
    df.loc[df.build_year < 1500, "build_year"] = np.nan
    df.loc[df.build_year > 2022, "build_year"] = np.nan
    return df
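Before relying on the cleaned columns, it can be worth checking how many values the cleaning actually blanks out. A quick sanity check along these lines (hypothetical, not part of the original pipeline):
# Hypothetical sanity check: NaNs introduced by preprocess_anomaly, per column
cols = ["full_sq", "life_sq", "kitch_sq", "num_room",
        "floor", "max_floor", "state", "build_year"]
before = train_raw[cols].isnull().sum()
after = preprocess_anomaly(train_raw.copy())[cols].isnull().sum()
print((after - before).sort_values(ascending=False))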
In [5]:
from sklearn.preprocessing import LabelEncoder

def preprocess_categorial(df):
    for c in list(df.columns):
        if df[c].dtype == 'object':
            lbl = LabelEncoder()
            try:
                # Fit on train + test values so unseen test labels do not break transform
                lbl.fit(list(train_raw[c].values) + list(test[c].values))
            except KeyError:
                lbl.fit(df[c].values)
            df[c + "_le"] = lbl.transform(list(df[c].values))
    # df = mess_y_categorial(df, 5)
    df = df.select_dtypes(exclude=['object'])
    return df

def apply_categorial(test, train):
    for c in list(test.columns):
        if test[c].dtype == 'object':
            lbl = LabelEncoder()
            try:
                lbl.fit(list(train_raw[c].values) + list(test[c].values))
            except KeyError:
                lbl.fit(test[c].values)
            test[c + "_le"] = lbl.transform(list(test[c].values))
    # test = mess_y_categorial_fold(test, train)
    test = test.select_dtypes(exclude=['object'])
    return test

def smoothed_likelihood(targ_mean, nrows, globalmean, alpha=10):
    # Blend the per-category target mean with the global mean, weighted
    # by the category count nrows and the smoothing strength alpha
    try:
        return (targ_mean * nrows + globalmean * alpha) / (nrows + alpha)
    except Exception:
        return float("NaN")

def mess_y_categorial(df, nfolds=3, alpha=10):
    # Out-of-fold target encoding: each fold is encoded with statistics
    # computed on the other folds, to avoid target leakage
    from copy import copy
    folds = np.array_split(df, nfolds)
    newfolds = []
    for i in range(nfolds):
        fold = folds[i]
        other_folds = copy(folds)
        other_folds.pop(i)
        other_fold = pd.concat(other_folds)
        newfolds.append(mess_y_categorial_fold(fold, other_fold, alpha=alpha))
    return pd.concat(newfolds)

def mess_y_categorial_fold(fold_raw, other_fold, cols=None, y_col="price_doc", alpha=10):
    fold = fold_raw.copy()
    if not cols:
        cols = list(fold.select_dtypes(include=["object"]).columns)
    globalmean = other_fold[y_col].mean()
    for c in cols:
        target_mean = other_fold[[c, y_col]].groupby(c).mean().to_dict()[y_col]
        nrows = other_fold[c].value_counts().to_dict()
        fold[c + "_sll"] = fold[c].apply(
            lambda x: smoothed_likelihood(target_mean.get(x), nrows.get(x), globalmean, alpha)
            if x else float("NaN")
        )
    return fold

def feature_exclude(df):
    # Drop the features that the greedy search below marked as "remove"
    feats = []
    with open("greedy_search.tsv") as gs:
        for line in gs:
            row = line.strip().split("\t")
            if len(row) < 6:
                continue
            if row[5] == "remove":
                feats.append(row[0])
    if feats:
        df = df.drop(feats, axis=1)
    return df
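To see what smoothed_likelihood does, consider a toy example (numbers invented for illustration): with alpha=10, a category seen only twice is pulled almost entirely toward the global mean, while a category seen a thousand times keeps its own mean.
# Toy illustration of the smoothing (invented numbers):
# rare category, mean 9e6 over 2 rows vs. global mean 7e6
print(smoothed_likelihood(9e6, 2, 7e6, alpha=10))     # ~7.33e6, shrunk toward the global mean
# frequent category, same means over 1000 rows
print(smoothed_likelihood(9e6, 1000, 7e6, alpha=10))  # ~8.98e6, keeps its own mean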
In [6]:
def apply_macro(df):
    # Merge only the hand-picked macro indicators (the full macro table
    # adds ~100 mostly noisy columns); "museum_visitis_per_100_cap" is
    # spelled as in the source data
    macro_cols = [
        'timestamp', "balance_trade", "balance_trade_growth", "eurrub", "average_provision_of_build_contract",
        "micex_rgbi_tr", "micex_cbi_tr", "deposits_rate", "mortgage_value", "mortgage_rate",
        "income_per_cap", "rent_price_4+room_bus", "museum_visitis_per_100_cap", "apartment_build"
    ]
    return pd.merge(df, macro[macro_cols], on='timestamp', how='left')
In [7]:
def preprocess(df):
    # df = apply_macro(df)
    ecology = ["no data", "poor", "satisfactory", "good", "excellent"]
    df["ecology_index"] = df["ecology"].map(ecology.index)
    df["age_of_building"] = df["timestamp"].apply(lambda x: x.split("-")[0]).astype(int) - df["build_year"]
    df["is_build_in_progress"] = df["age_of_building"].apply(lambda x: "yes" if x < 0 else "no")

    bool_feats = [
        "thermal_power_plant_raion",
        "incineration_raion",
        "oil_chemistry_raion",
        "radiation_raion",
        "railroad_terminal_raion",
        "big_market_raion",
        "nuclear_reactor_raion",
        "detention_facility_raion",
        "water_1line",
        "big_road1_1line",
        "railroad_1line",
        "culture_objects_top_25"
    ]
    for bf in bool_feats:
        df[bf + "_bool"] = (df[bf] == "yes")

    df = preprocess_anomaly(df)

    # Relative floor and area features
    df['rel_floor'] = df['floor'] / df['max_floor'].astype(float)
    df['rel_kitch_sq'] = df['kitch_sq'] / df['full_sq'].astype(float)
    df['rel_life_sq'] = df['life_sq'] / df['full_sq'].astype(float)

    # Low-cardinality numeric codes become categorical strings;
    # the 0 / -10 sentinels mark missing values and are blanked out
    df["material_cat"] = df.material.fillna(0).astype(int).astype(str).replace("0", "")
    df["state_cat"] = df.state.fillna(0).astype(int).astype(str).replace("0", "")
    df["num_room_cat"] = df.num_room.fillna(0).astype(int).astype(str).replace("0", "")
    df["build_year_cat"] = df.build_year.fillna(0).astype(int).astype(str).replace("0", "")
    df["ID_metro"] = df.ID_metro.fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_railroad_station_walk"] = df.ID_railroad_station_walk.replace("", "-10").fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_railroad_station_avto"] = df.ID_railroad_station_avto.fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_big_road1"] = df.ID_big_road1.fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_big_road2"] = df.ID_big_road2.fillna(-10).astype(int).astype(str).replace("-10", "")
    df["ID_bus_terminal"] = df.ID_bus_terminal.fillna(-10).astype(int).astype(str).replace("-10", "")

    # # ratio of living area to full area #
    # df["ratio_life_sq_full_sq"] = df["life_sq"] / np.maximum(df["full_sq"].astype("float"), 1)
    # df.loc[df["ratio_life_sq_full_sq"] < 0, "ratio_life_sq_full_sq"] = 0
    # df.loc[df["ratio_life_sq_full_sq"] > 1, "ratio_life_sq_full_sq"] = 1
    # # ratio of kitchen area to living area #
    # df["ratio_kitch_sq_life_sq"] = df["kitch_sq"] / np.maximum(df["life_sq"].astype("float"), 1)
    # df.loc[df["ratio_kitch_sq_life_sq"] < 0, "ratio_kitch_sq_life_sq"] = 0
    # df.loc[df["ratio_kitch_sq_life_sq"] > 1, "ratio_kitch_sq_life_sq"] = 1
    # # ratio of kitchen area to full area #
    # df["ratio_kitch_sq_full_sq"] = df["kitch_sq"] / np.maximum(df["full_sq"].astype("float"), 1)
    # df.loc[df["ratio_kitch_sq_full_sq"] < 0, "ratio_kitch_sq_full_sq"] = 0
    # df.loc[df["ratio_kitch_sq_full_sq"] > 1, "ratio_kitch_sq_full_sq"] = 1

    df = df.drop(["id", "timestamp"], axis=1)
    return df
In [8]:
train_pr = preprocess(train_raw)
train_pr = preprocess_categorial(train_pr)
train = feature_exclude(train_pr)
train["price_meter"] = train["price_doc"] / train["full_sq"]
# train = train.fillna(-1)
X = train.drop(["price_doc", "price_meter"], axis=1)
y = train["price_meter"].values
In [10]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X.values, y, test_size=0.20, random_state=43)
dtrain_all = xgb.DMatrix(X.values, y, feature_names=X.columns)
dtrain = xgb.DMatrix(X_train, y_train, feature_names=X.columns)
dval = xgb.DMatrix(X_val, y_val, feature_names=X.columns)
In [11]:
xgb_params = {
    'max_depth': 5,
    'n_estimators': 200,  # ignored by xgb.train; only the sklearn wrapper reads it
    'learning_rate': 0.01,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Tune num_boost_round via early stopping on the held-out validation set
model = xgb.train(xgb_params, dtrain, num_boost_round=4000, evals=[(dval, 'val')],
                  early_stopping_rounds=40, verbose_eval=40)
num_boost_round = model.best_iteration
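One aside, since the cells below reuse num_boost_round: model.best_iteration is the zero-based index of the best boosting round, so retraining for exactly best_iteration rounds stops one round short of the early-stopped optimum. A common adjustment (an assumption about intent, not what this run used):
# best_iteration is zero-based, so best_iteration + 1 rounds reproduce the
# early-stopped optimum when retraining without a validation set
num_boost_round = model.best_iteration + 1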
In [12]:
imp_features = pd.DataFrame(
    list(model.get_fscore().items()),
    columns=['feature', 'importance']
).sort_values('importance', ascending=False)
imp_features
Out[12]:
In [13]:
cv_output = xgb.cv(dict(xgb_params, silent=0), dtrain_all, num_boost_round=num_boost_round, verbose_eval=40)
cv_output[['train-rmse-mean', 'test-rmse-mean']].plot()
Out[13]:
In [14]:
model = xgb.train(dict(xgb_params, silent=0), dtrain_all, num_boost_round=num_boost_round, verbose_eval=40)
print "predict-train:", rmse(model.predict(dtrain_all), y)
In [15]:
test_pr = preprocess(test)
train_pr = preprocess(train_raw)
test_pr = apply_categorial(test_pr, train_pr)
test_pr = feature_exclude(test_pr)
# test_pr = test_pr.fillna(-1)
dtest = xgb.DMatrix(test_pr.values, feature_names=test_pr.columns)
y_pred = model.predict(dtest)
# y_pred = model.predict(test_pr.values)
# y_pred = np.exp(y_pred) - 1
submdf = pd.DataFrame({"id": test["id"], "price_doc": y_pred})
submdf.to_csv("data/submission.csv", header=True, index=False)
!head data/submission.csv
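A caveat before the experiment log: with the cells above, the model is trained on price_meter, so model.predict(dtest) returns prices per square metre, while the submission file expects full price_doc values. If the per-metre target is kept, a rescaling step along these lines would be needed (a sketch; assumes test full_sq is present and cleaned):
# Sketch: convert per-square-metre predictions back to full prices
# before writing the submission (only needed with the price_meter target)
y_pred_doc = y_pred * test["full_sq"].values
submdf = pd.DataFrame({"id": test["id"], "price_doc": y_pred_doc})
submdf.to_csv("data/submission.csv", header=True, index=False)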
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude:
val-rmse:2.55793e+06
train-rmse:1.74066e+06+28727.3 test-rmse:2.65025e+06+64969.5
predict-train: 1881896.66663
kaggle: 0.31344
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro + other, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:
val-rmse:2.54654e+06
train-rmse:1.74594e+06+24020 test-rmse:2.66053e+06+67300.3
predict-train: 1883352.60935
kaggle: 0.31364
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:
val-rmse:2.55613e+06
train-rmse:1.74466e+06+27385.6 test-rmse:2.66422e+06+69734.1
predict-train: 1888051.35357
kaggle: 0.31366
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro with other ID, ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle, feature_exclude 143:
val-rmse:2.58557e+06
train-rmse:1.98509e+06+26803.7 test-rmse:2.68755e+06+59691.1
predict-train: 2092731.29028
kaggle: 0.31731
5*200, no macro, add rel features, no log price, train_without_noise:
val-rmse:2.63772e+06
train-rmse:1.9989e+06+10986.4 test-rmse:2.69158e+06+53020
predict-train: 2076010.27131
kaggle: 0.31720
5*200, no macro, add rel features, no log price, train_with_noise:
val-rmse:2.53378e+06
train-rmse:1.95069e+06+16166.4 test-rmse:2.69703e+06+61455.1
predict-train: 2054421.59869
kaggle: 0.32056
5*200, macro, add rel features, no log price, train_without_noise:
val-rmse:2.79632e+06
train-rmse:1.81015e+06+19781.2 test-rmse:2.6641e+06+123875
predict-train: 1904063.27368
kaggle: 0.32976
5*200, no macro, add rel features, no log price, train_without_noise:
val-rmse:2.61682e+06
train-rmse:1.81123e+06+27681.2 test-rmse:2.66923e+06+53925.7
predict-train: 1899129.43771
kaggle: 0.31592
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.61055e+06
train-rmse:1.71826e+06+30076.1 test-rmse:2.66515e+06+54583.5
predict-train: 1814572.97424
kaggle: 0.31602
7*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.59955e+06
train-rmse:1.41393e+06+21208.1 test-rmse:2.6763e+06+35553.3
predict-train: 1548257.49121
kaggle: 0.31768
4*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.63407e+06
train-rmse:1.96513e+06+21470.8 test-rmse:2.69417e+06+74288.3
predict-train: 2062299.41091
kaggle: 0.31952
7*200, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.59955e+06
train-rmse:1.41393e+06+21208.1 test-rmse:2.6763e+06+35553.3
predict-train: 1548257.49121
5*300, no macro, add rel features, no log price, train_without_noise, 4000 iter:
val-rmse:2.61055e+06
train-rmse:1.71826e+06+30076.1 test-rmse:2.66515e+06+54583.5
predict-train: 1814572.97424
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna:
val-rmse:2.61664e+06
train-rmse:1.77892e+06+23111 test-rmse:2.65829e+06+56398.6
predict-train: 1875799.54634
kaggle: 0.31521
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean:
val-rmse:2.6265e+06
train-rmse:1.78478e+06+22545.4 test-rmse:2.66179e+06+60626.3
predict-train: 1881672.27588
kaggle: 0.31476
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, no super features + Label Encoding:
val-rmse:2.56494e+06
train-rmse:1.78862e+06+18589.1 test-rmse:2.69283e+06+79861.4
predict-train: 1923466.41923
kaggle: 0.31434
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, remove material state num_room:
val-rmse:2.56932e+06
train-rmse:1.88495e+06+20133.7 test-rmse:2.69624e+06+70491.2
predict-train: 1979198.19201
kaggle: 0.31513
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro/bus...:
val-rmse:2.60017e+06
train-rmse:1.80654e+06+19453.5 test-rmse:2.68203e+06+68169.5
predict-train: 1906439.98603
kaggle: 0.31927
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, remove 50 features:
val-rmse:2.93665e+06
train-rmse:1.73425e+06+19462.4 test-rmse:2.68682e+06+140661
predict-train: 1861268.6455
kaggle: 0.31555
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, remove 50 features, add ratio feats:
val-rmse:2.59747e+06
train-rmse:1.75828e+06+26639.4 test-rmse:2.68491e+06+67201.8
predict-train: 1875707.6581
kaggle: 0.31760
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, superfeatures + Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle:
val-rmse:2.5419e+06
train-rmse:1.74381e+06+22710.7 test-rmse:2.65787e+06+66889.9
predict-train: 1862467.67153
kaggle: 0.31716
5*200, no macro, add rel features, no log price, train_without_noise, 4000 iter, not fillna, superclean, ID metro, no ratio feats, no superfeatures, Label Encoding, is_build_in_progress + age_of_building, kfold wo shuffle:
val-rmse:2.5676e+06
train-rmse:1.81485e+06+24274 test-rmse:2.67324e+06+60153.1
predict-train: 1947645.83102
kaggle: 0.31376
In [45]:
from tqdm import tqdm

def get_best_score(train):
    xgb_params = {
        'max_depth': 5,
        'n_estimators': 200,
        'learning_rate': 0.01,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1
    }
    cvres = xgb.cv(xgb_params, train, num_boost_round=4000, early_stopping_rounds=40)
    return cvres["test-rmse-mean"].min(), cvres["test-rmse-mean"].idxmin()

def df2DMatrix(df):
    return xgb.DMatrix(data=df.drop("price_doc", axis=1).values, label=df["price_doc"].values)

def greedy_remove_features(df, feature_importances):
    # Greedy backward elimination: try dropping features from least to most
    # important and keep the drop whenever the CV score does not get worse.
    # Every decision is appended to greedy_search.tsv so the search can resume.
    train = df
    with open("greedy_search.tsv", "a") as f:
        best_score, iterno = get_best_score(df2DMatrix(df))
        f.write("\t".join(["INITIAL", str(best_score), str(iterno)]) + "\n")
        to_analyze = sorted(feature_importances.items(), key=lambda x: x[1])
        for feat, feat_importance in tqdm(to_analyze):
            f.flush()
            candidate_train = train.drop(feat, axis=1)
            cand_best_score, iterno = get_best_score(df2DMatrix(candidate_train))
            if cand_best_score > best_score:
                # score got worse, keep the feature
                f.write("\t".join([feat, str(cand_best_score), str(best_score), str(feat_importance), str(iterno), "skip"]) + "\n")
                f.flush()
                continue
            f.write("\t".join([feat, str(cand_best_score), str(best_score), str(feat_importance), str(iterno), "remove"]) + "\n")
            best_score = cand_best_score
            train = candidate_train
In [47]:
feature_importances = imp_features.set_index("feature").to_dict()["importance"]

# Resume the search: re-apply drops already logged in greedy_search.tsv
train_gs = train
with open("greedy_search.tsv") as gs:
    for line in gs:
        row = line.strip().split("\t")
        if len(row) < 6:
            continue
        if row[5] == "remove":
            try:
                train_gs = train_gs.drop(row[0], axis=1)
            except ValueError:
                pass
            print("drop", row[0])
            feature_importances.pop(row[0], None)

greedy_remove_features(train_gs, feature_importances)
In [ ]: