In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Notebook-wide presentation settings.
# pandas: show up to 300 columns, and render floats with thousands separators
# and two decimals, right-aligned in a 20-character field.
pandas_display_options = {
    'display.max_columns': 300,
    'display.float_format': '{:20,.2f}'.format,
}
for option_name, option_value in pandas_display_options.items():
    pd.set_option(option_name, option_value)

# matplotlib: use the ggplot style sheet for every figure drawn below.
plt.style.use('ggplot')

In [46]:
# Load the raw training set, using the first CSV column as the index and
# parsing the `timestamp` column as datetimes.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where a strict version of that inference is the default, so the
# argument only produces a warning.
train_df = pd.read_csv(
    '../../data/raw/train.csv',
    index_col=0,
    parse_dates=['timestamp'],
)

In [47]:
# Preview the first rows (head() shows 5 by default) to eyeball the loaded columns.
train_df.head()


Out[47]:
timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state product_type sub_area area_m raion_popul green_zone_part indust_part children_preschool preschool_quota preschool_education_centers_raion children_school school_quota school_education_centers_raion school_education_centers_top_20_raion hospital_beds_raion healthcare_centers_raion university_top_20_raion sport_objects_raion additional_education_raion culture_objects_top_25 culture_objects_top_25_raion shopping_centers_raion office_raion thermal_power_plant_raion incineration_raion oil_chemistry_raion radiation_raion railroad_terminal_raion big_market_raion nuclear_reactor_raion detention_facility_raion full_all male_f female_f young_all young_male young_female work_all work_male work_female ekder_all ekder_male ekder_female 0_6_all 0_6_male 0_6_female 7_14_all 7_14_male 7_14_female 0_17_all 0_17_male 0_17_female 16_29_all 16_29_male 16_29_female 0_13_all 0_13_male 0_13_female raion_build_count_with_material_info build_count_block build_count_wood build_count_frame build_count_brick build_count_monolith build_count_panel build_count_foam build_count_slag build_count_mix raion_build_count_with_builddate_info build_count_before_1920 build_count_1921-1945 build_count_1946-1970 build_count_1971-1995 build_count_after_1995 ID_metro metro_min_avto metro_km_avto metro_min_walk metro_km_walk kindergarten_km school_km park_km green_zone_km industrial_km water_treatment_km cemetery_km incineration_km railroad_station_walk_km railroad_station_walk_min ID_railroad_station_walk railroad_station_avto_km railroad_station_avto_min ID_railroad_station_avto public_transport_station_km public_transport_station_min_walk water_km water_1line mkad_km ttk_km sadovoe_km bulvar_ring_km kremlin_km big_road1_km ID_big_road1 big_road1_1line big_road2_km ID_big_road2 railroad_km railroad_1line zd_vokzaly_avto_km ID_railroad_terminal bus_terminal_avto_km ID_bus_terminal oil_chemistry_km nuclear_reactor_km 
radiation_km power_transmission_line_km thermal_power_plant_km ts_km big_market_km market_shop_km fitness_km swim_pool_km ice_rink_km stadium_km basketball_km hospice_morgue_km detention_facility_km public_healthcare_km university_km workplaces_km shopping_centers_km office_km additional_education_km preschool_km big_church_km church_synagogue_km mosque_km theater_km museum_km exhibition_km catering_km ecology green_part_500 prom_part_500 office_count_500 office_sqm_500 trc_count_500 trc_sqm_500 cafe_count_500 cafe_sum_500_min_price_avg cafe_sum_500_max_price_avg cafe_avg_price_500 cafe_count_500_na_price cafe_count_500_price_500 cafe_count_500_price_1000 cafe_count_500_price_1500 cafe_count_500_price_2500 cafe_count_500_price_4000 cafe_count_500_price_high big_church_count_500 church_count_500 mosque_count_500 leisure_count_500 sport_count_500 market_count_500 green_part_1000 prom_part_1000 office_count_1000 office_sqm_1000 trc_count_1000 trc_sqm_1000 cafe_count_1000 cafe_sum_1000_min_price_avg cafe_sum_1000_max_price_avg cafe_avg_price_1000 cafe_count_1000_na_price cafe_count_1000_price_500 cafe_count_1000_price_1000 cafe_count_1000_price_1500 cafe_count_1000_price_2500 cafe_count_1000_price_4000 cafe_count_1000_price_high big_church_count_1000 church_count_1000 mosque_count_1000 leisure_count_1000 sport_count_1000 market_count_1000 green_part_1500 prom_part_1500 office_count_1500 office_sqm_1500 trc_count_1500 trc_sqm_1500 cafe_count_1500 cafe_sum_1500_min_price_avg cafe_sum_1500_max_price_avg cafe_avg_price_1500 cafe_count_1500_na_price cafe_count_1500_price_500 cafe_count_1500_price_1000 cafe_count_1500_price_1500 cafe_count_1500_price_2500 cafe_count_1500_price_4000 cafe_count_1500_price_high big_church_count_1500 church_count_1500 mosque_count_1500 leisure_count_1500 sport_count_1500 market_count_1500 green_part_2000 prom_part_2000 office_count_2000 office_sqm_2000 trc_count_2000 trc_sqm_2000 cafe_count_2000 cafe_sum_2000_min_price_avg 
cafe_sum_2000_max_price_avg cafe_avg_price_2000 cafe_count_2000_na_price cafe_count_2000_price_500 cafe_count_2000_price_1000 cafe_count_2000_price_1500 cafe_count_2000_price_2500 cafe_count_2000_price_4000 cafe_count_2000_price_high big_church_count_2000 church_count_2000 mosque_count_2000 leisure_count_2000 sport_count_2000 market_count_2000 green_part_3000 prom_part_3000 office_count_3000 office_sqm_3000 trc_count_3000 trc_sqm_3000 cafe_count_3000 cafe_sum_3000_min_price_avg cafe_sum_3000_max_price_avg cafe_avg_price_3000 cafe_count_3000_na_price cafe_count_3000_price_500 cafe_count_3000_price_1000 cafe_count_3000_price_1500 cafe_count_3000_price_2500 cafe_count_3000_price_4000 cafe_count_3000_price_high big_church_count_3000 church_count_3000 mosque_count_3000 leisure_count_3000 sport_count_3000 market_count_3000 green_part_5000 prom_part_5000 office_count_5000 office_sqm_5000 trc_count_5000 trc_sqm_5000 cafe_count_5000 cafe_sum_5000_min_price_avg cafe_sum_5000_max_price_avg cafe_avg_price_5000 cafe_count_5000_na_price cafe_count_5000_price_500 cafe_count_5000_price_1000 cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
id
1 2011-08-20 43 27.00 4.00 nan nan nan nan nan nan Investment Bibirevo 6,407,578.10 155572 0.19 0.00 9576 5,001.00 5 10309 11,065.00 5 0 240.00 1 0 7 3 no 0 16 1 no no no no no no no no 86206 40477 45729 21154 11007 10147 98207 52277 45930 36211 10580 25631 9576 4899 4677 10309 5463 4846 23603 12286 11317 17508 9425 8083 18654 9709 8945 211.00 25.00 0.00 0.00 0.00 2.00 184.00 0.00 0.00 0.00 211.00 0.00 0.00 0.00 206.00 5.00 1 2.59 1.13 13.58 1.13 0.15 0.18 2.16 0.60 1.08 23.68 1.80 3.63 5.42 65.04 1.00 5.42 6.91 1 0.27 3.30 0.99 no 1.42 10.92 13.10 13.68 15.16 1.42 1 no 3.83 5 1.31 no 14.23 101 24.29 1 18.15 5.72 1.21 1.06 5.81 4.31 10.81 1.68 0.49 3.07 1.11 8.15 3.52 2.39 4.25 0.97 6.72 0.88 0.65 0.64 0.95 0.18 0.63 0.63 3.93 14.05 7.39 7.02 0.52 good 0.00 0.00 0 0 0 0 0 nan nan nan 0 0 0 0 0 0 0 0 0 0 0 1 0 7.36 0.00 1 30500 3 55600 19 527.78 888.89 708.33 1 10 4 3 1 0 0 1 2 0 0 6 1 14.27 6.92 3 39554 9 171420 34 566.67 969.70 768.18 1 14 11 6 2 0 0 1 2 0 0 7 1 11.77 15.97 9 188854 19 1244891 36 614.29 1,042.86 828.57 1 15 11 6 2 1 0 1 2 0 0 10 1 11.98 13.55 12 251554 23 1419204 68 639.68 1,079.37 859.52 5 21 22 16 3 1 0 2 4 0 0 21 1 13.09 13.31 29 807385 52 4036616 152 708.57 1,185.71 947.14 12 39 48 40 9 4 0 13 22 1 0 52 4 5850000
2 2011-08-23 34 19.00 3.00 nan nan nan nan nan nan Investment Nagatinskij Zaton 9,589,336.91 115352 0.37 0.05 6880 3,119.00 5 7759 6,237.00 8 0 229.00 1 0 6 1 yes 1 3 0 no no no no no no no no 76284 34200 42084 15727 7925 7802 70194 35622 34572 29431 9266 20165 6880 3466 3414 7759 3909 3850 17700 8998 8702 15164 7571 7593 13729 6929 6800 245.00 83.00 1.00 0.00 67.00 4.00 90.00 0.00 0.00 0.00 244.00 1.00 1.00 143.00 84.00 15.00 2 0.94 0.65 7.62 0.64 0.15 0.27 0.55 0.07 0.97 1.32 4.66 8.65 3.41 40.94 2.00 3.64 4.68 2 0.07 0.78 0.70 no 9.50 3.10 6.44 8.13 8.70 2.89 2 no 3.10 4 0.69 no 9.24 32 5.71 2 9.03 3.49 2.72 1.25 3.42 0.73 6.91 3.42 0.67 2.00 8.97 6.13 1.16 2.54 12.65 1.48 1.85 0.69 0.52 0.69 1.07 0.27 0.97 0.47 4.84 6.83 0.71 2.36 0.23 excellent 25.14 0.00 0 0 0 0 5 860.00 1,500.00 1,180.00 0 1 3 0 0 1 0 0 1 0 0 0 0 26.66 0.07 2 86600 5 94065 13 615.38 1,076.92 846.15 0 5 6 1 0 1 0 1 2 0 4 2 0 21.53 7.71 3 102910 7 127065 17 694.12 1,205.88 950.00 0 6 7 1 2 1 0 1 5 0 4 9 0 22.37 19.25 4 165510 8 179065 21 695.24 1,190.48 942.86 0 7 8 3 2 1 0 1 5 0 4 11 0 18.07 27.32 12 821986 14 491565 30 631.03 1,086.21 858.62 1 11 11 4 2 1 0 1 7 0 6 19 1 10.26 27.47 66 2690465 40 2034942 177 673.81 1,148.81 911.31 9 49 65 36 15 3 0 15 29 1 10 66 14 6000000
3 2011-08-27 43 29.00 2.00 nan nan nan nan nan nan Investment Tekstil'shhiki 4,808,269.83 101708 0.11 0.12 5879 1,463.00 4 6207 5,580.00 7 0 1,183.00 1 0 5 1 no 0 0 1 no no no yes no no no no 101982 46076 55906 13028 6835 6193 63388 31813 31575 25292 7609 17683 5879 3095 2784 6207 3269 2938 14884 7821 7063 19401 9045 10356 11252 5916 5336 330.00 59.00 0.00 0.00 206.00 4.00 60.00 0.00 1.00 0.00 330.00 1.00 0.00 246.00 63.00 20.00 3 2.12 1.64 17.35 1.45 0.05 0.16 0.37 0.45 0.94 4.91 3.38 12.00 1.28 15.33 3.00 1.28 1.70 3 0.33 3.95 0.47 no 5.60 2.93 6.96 8.05 9.07 0.65 3 no 2.93 4 0.70 no 9.54 5 6.71 3 5.78 7.51 0.77 1.60 3.68 3.56 5.75 1.38 0.73 1.24 1.98 0.77 1.95 0.62 7.68 0.10 0.84 1.51 1.49 1.54 0.39 0.16 3.18 0.76 7.92 4.27 3.16 4.96 0.19 poor 1.67 0.00 0 0 0 0 3 666.67 1,166.67 916.67 0 0 2 1 0 0 0 0 0 0 0 0 0 4.99 0.29 0 0 0 0 9 642.86 1,142.86 892.86 2 0 5 2 0 0 0 0 1 0 0 5 3 9.92 6.73 0 0 1 2600 14 516.67 916.67 716.67 2 4 6 2 0 0 0 0 4 0 0 6 5 12.99 12.75 4 100200 7 52550 24 563.64 977.27 770.45 2 8 9 4 1 0 0 0 4 0 0 8 5 12.14 26.46 8 110856 7 52550 41 697.44 1,192.31 944.87 2 9 17 9 3 1 0 0 11 0 0 20 6 13.69 21.58 43 1478160 35 1572990 122 702.68 1,196.43 949.55 10 29 45 25 10 3 0 11 27 0 4 67 10 5700000
4 2011-09-01 89 50.00 9.00 nan nan nan nan nan nan Investment Mitino 12,583,535.69 178473 0.19 0.07 13087 6,839.00 9 13670 17,063.00 10 0 nan 1 0 17 6 no 0 11 4 no no no no no no no no 21155 9828 11327 28563 14680 13883 120381 60040 60341 29529 9083 20446 13087 6645 6442 13670 7126 6544 32063 16513 15550 3292 1450 1842 24934 12782 12152 458.00 9.00 51.00 12.00 124.00 50.00 201.00 0.00 9.00 2.00 459.00 13.00 24.00 40.00 130.00 252.00 4 1.49 0.98 11.57 0.96 0.18 0.24 0.08 0.11 0.45 15.62 2.02 14.32 4.29 51.50 4.00 3.82 5.27 4 0.13 1.58 1.20 no 2.68 14.61 17.46 18.31 19.49 2.68 1 no 2.78 17 2.00 no 17.48 83 6.73 1 27.67 9.52 6.35 1.77 11.18 0.58 27.89 0.81 0.62 1.95 6.48 7.39 4.92 3.55 8.79 2.16 10.90 0.62 0.60 0.93 0.89 0.24 1.03 1.56 15.30 16.99 16.04 5.03 0.47 good 17.36 0.57 0 0 0 0 2 1,000.00 1,500.00 1,250.00 0 0 0 2 0 0 0 0 0 0 0 0 0 19.25 10.35 1 11000 6 80780 12 658.33 1,083.33 870.83 0 3 4 5 0 0 0 0 0 0 0 3 1 28.38 6.57 2 11000 7 89492 23 673.91 1,130.43 902.17 0 5 9 8 1 0 0 1 0 0 0 9 2 32.29 5.73 2 11000 7 89492 25 660.00 1,120.00 890.00 0 5 11 8 1 0 0 1 1 0 0 13 2 20.79 3.57 4 167000 12 205756 32 718.75 1,218.75 968.75 0 5 14 10 3 0 0 1 2 0 0 18 3 14.18 3.89 8 244166 22 942180 61 931.58 1,552.63 1,242.11 4 7 21 15 11 2 1 4 4 0 0 26 3 13100000
5 2011-09-05 77 77.00 4.00 nan nan nan nan nan nan Investment Basmannoe 8,398,460.62 108171 0.02 0.04 5706 3,240.00 7 6748 7,770.00 9 0 562.00 4 2 25 2 no 0 10 93 no no no yes yes no no no 28179 13522 14657 13368 7159 6209 68043 34236 33807 26760 8563 18197 5706 2982 2724 6748 3664 3084 15237 8113 7124 5164 2583 2581 11631 6223 5408 746.00 48.00 0.00 0.00 643.00 16.00 35.00 0.00 3.00 1.00 746.00 371.00 114.00 146.00 62.00 53.00 5 1.26 0.88 8.27 0.69 0.25 0.38 0.26 0.24 0.39 10.68 2.94 11.90 0.85 10.25 5.00 1.60 2.16 113 0.07 0.86 0.82 no 11.62 1.72 0.05 0.79 2.58 1.72 4 no 3.13 10 0.08 yes 1.60 113 1.42 4 6.52 8.67 1.64 3.63 4.59 2.61 9.16 1.97 0.22 2.54 3.98 3.61 0.31 1.86 3.78 1.12 0.99 0.89 0.43 0.08 0.81 0.38 0.38 0.12 2.58 1.11 1.80 1.34 0.03 excellent 3.56 4.44 15 293699 1 45000 48 702.22 1,166.67 934.44 3 17 10 11 7 0 0 1 4 0 2 3 0 3.34 8.29 46 420952 3 158200 153 763.45 1,272.41 1,017.93 8 39 45 39 19 2 1 7 12 0 6 7 0 4.12 4.83 93 1195735 9 445900 272 766.80 1,272.73 1,019.76 19 70 74 72 30 6 1 18 30 0 10 14 2 4.53 5.02 149 1625130 17 564843 483 765.93 1,269.23 1,017.58 28 130 129 131 50 14 1 35 61 0 17 21 3 5.06 8.62 305 3420907 60 2296870 1068 853.03 1,410.45 1,131.74 63 266 267 262 149 57 4 70 121 1 40 77 5 8.38 10.92 689 8404624 114 3503058 2283 853.88 1,411.45 1,132.66 143 566 578 552 319 108 17 135 236 2 91 195 14 16331452

In [48]:
# Distribution of `price_doc` on the raw scale (left) and after a natural-log
# transform (right), side by side.
fig, axs = plt.subplots(ncols=2, figsize=(12, 4))

# histplot(kde=True, stat='density') is the replacement for the deprecated
# sns.distplot: a density-normalised histogram with a KDE curve on top.
# A fixed bins=50 keeps the bars readable despite the long right tail.
sns.histplot(train_df['price_doc'], kde=True, stat='density', bins=50, ax=axs[0])
axs[0].set(title='price_doc', xlabel='price_doc')

sns.histplot(np.log(train_df['price_doc']), kde=True, stat='density', bins=50, ax=axs[1])
# np.log keeps the Series name, so without an explicit label the log panel
# would also be labelled 'price_doc'.
axs[1].set(title='log(price_doc)', xlabel='log(price_doc)')

fig.tight_layout()
plt.show()



In [49]:
# Distribution of `full_sq` on the raw scale (left) and after a natural-log
# transform (right), side by side.
fig, axs = plt.subplots(ncols=2, figsize=(12, 4))

fs = train_df['full_sq']
# Only strictly positive values are plotted: the log is undefined at 0, and the
# same subset is used on both panels so they stay comparable.
fs_positive = fs[fs > 0]

# histplot(kde=True, stat='density') is the replacement for the deprecated
# sns.distplot: a density-normalised histogram with a KDE curve on top.
sns.histplot(fs_positive, kde=True, stat='density', bins=50, ax=axs[0])
axs[0].set(title='full_sq (> 0)', xlabel='full_sq')

sns.histplot(np.log(fs_positive), kde=True, stat='density', bins=50, ax=axs[1])
# np.log keeps the Series name, so without an explicit label the log panel
# would also be labelled 'full_sq'.
axs[1].set(title='log(full_sq) (> 0)', xlabel='log(full_sq)')

fig.tight_layout()
plt.show()



In [50]:
# Summary statistics for `price_doc`: count, mean, std, min, quartiles and max.
train_df['price_doc'].describe()


Out[50]:
count              30,471.00
mean            7,123,035.28
std             4,780,111.33
min               100,000.00
25%             4,740,002.00
50%             6,274,411.00
75%             8,300,000.00
max           111,111,112.00
Name: price_doc, dtype: float64

In [75]:
# Derive a 'YYYY-MM' string key from `timestamp` (e.g. 2011-08-20 -> '2011-08').
# The vectorised .dt.strftime replaces a per-row Python lambda; missing
# timestamps stay missing instead of becoming the string 'nan-nan'.
train_df['year_month'] = train_df['timestamp'].dt.strftime('%Y-%m')

In [ ]: