In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
# Notebook-wide display configuration.
# Plots use matplotlib's bundled 'ggplot' style sheet.
plt.style.use('ggplot')
# Show up to 300 columns when a DataFrame is rendered, and format floats
# with thousands separators and two decimals in a 20-character field.
pd.options.display.max_columns = 300
pd.options.display.float_format = '{:20,.2f}'.format
In [46]:
# Load the raw training set.
# Columns are addressed by name rather than by position: the rendered
# train_df.head() output shows `id` as the index and `timestamp` as the
# first data column, which is what index_col=0 / parse_dates=[1] selected.
# `infer_datetime_format` is intentionally not passed: it is deprecated
# since pandas 2.0 (format inference is the default there) and only
# triggers a warning.
train_df = pd.read_csv(
    '../../data/raw/train.csv',
    index_col='id',
    parse_dates=['timestamp'],
)
In [47]:
# Preview the first five rows (5 is the default for DataFrame.head).
train_df.head(n=5)
Out[47]:
timestamp
full_sq
life_sq
floor
max_floor
material
build_year
num_room
kitch_sq
state
product_type
sub_area
area_m
raion_popul
green_zone_part
indust_part
children_preschool
preschool_quota
preschool_education_centers_raion
children_school
school_quota
school_education_centers_raion
school_education_centers_top_20_raion
hospital_beds_raion
healthcare_centers_raion
university_top_20_raion
sport_objects_raion
additional_education_raion
culture_objects_top_25
culture_objects_top_25_raion
shopping_centers_raion
office_raion
thermal_power_plant_raion
incineration_raion
oil_chemistry_raion
radiation_raion
railroad_terminal_raion
big_market_raion
nuclear_reactor_raion
detention_facility_raion
full_all
male_f
female_f
young_all
young_male
young_female
work_all
work_male
work_female
ekder_all
ekder_male
ekder_female
0_6_all
0_6_male
0_6_female
7_14_all
7_14_male
7_14_female
0_17_all
0_17_male
0_17_female
16_29_all
16_29_male
16_29_female
0_13_all
0_13_male
0_13_female
raion_build_count_with_material_info
build_count_block
build_count_wood
build_count_frame
build_count_brick
build_count_monolith
build_count_panel
build_count_foam
build_count_slag
build_count_mix
raion_build_count_with_builddate_info
build_count_before_1920
build_count_1921-1945
build_count_1946-1970
build_count_1971-1995
build_count_after_1995
ID_metro
metro_min_avto
metro_km_avto
metro_min_walk
metro_km_walk
kindergarten_km
school_km
park_km
green_zone_km
industrial_km
water_treatment_km
cemetery_km
incineration_km
railroad_station_walk_km
railroad_station_walk_min
ID_railroad_station_walk
railroad_station_avto_km
railroad_station_avto_min
ID_railroad_station_avto
public_transport_station_km
public_transport_station_min_walk
water_km
water_1line
mkad_km
ttk_km
sadovoe_km
bulvar_ring_km
kremlin_km
big_road1_km
ID_big_road1
big_road1_1line
big_road2_km
ID_big_road2
railroad_km
railroad_1line
zd_vokzaly_avto_km
ID_railroad_terminal
bus_terminal_avto_km
ID_bus_terminal
oil_chemistry_km
nuclear_reactor_km
radiation_km
power_transmission_line_km
thermal_power_plant_km
ts_km
big_market_km
market_shop_km
fitness_km
swim_pool_km
ice_rink_km
stadium_km
basketball_km
hospice_morgue_km
detention_facility_km
public_healthcare_km
university_km
workplaces_km
shopping_centers_km
office_km
additional_education_km
preschool_km
big_church_km
church_synagogue_km
mosque_km
theater_km
museum_km
exhibition_km
catering_km
ecology
green_part_500
prom_part_500
office_count_500
office_sqm_500
trc_count_500
trc_sqm_500
cafe_count_500
cafe_sum_500_min_price_avg
cafe_sum_500_max_price_avg
cafe_avg_price_500
cafe_count_500_na_price
cafe_count_500_price_500
cafe_count_500_price_1000
cafe_count_500_price_1500
cafe_count_500_price_2500
cafe_count_500_price_4000
cafe_count_500_price_high
big_church_count_500
church_count_500
mosque_count_500
leisure_count_500
sport_count_500
market_count_500
green_part_1000
prom_part_1000
office_count_1000
office_sqm_1000
trc_count_1000
trc_sqm_1000
cafe_count_1000
cafe_sum_1000_min_price_avg
cafe_sum_1000_max_price_avg
cafe_avg_price_1000
cafe_count_1000_na_price
cafe_count_1000_price_500
cafe_count_1000_price_1000
cafe_count_1000_price_1500
cafe_count_1000_price_2500
cafe_count_1000_price_4000
cafe_count_1000_price_high
big_church_count_1000
church_count_1000
mosque_count_1000
leisure_count_1000
sport_count_1000
market_count_1000
green_part_1500
prom_part_1500
office_count_1500
office_sqm_1500
trc_count_1500
trc_sqm_1500
cafe_count_1500
cafe_sum_1500_min_price_avg
cafe_sum_1500_max_price_avg
cafe_avg_price_1500
cafe_count_1500_na_price
cafe_count_1500_price_500
cafe_count_1500_price_1000
cafe_count_1500_price_1500
cafe_count_1500_price_2500
cafe_count_1500_price_4000
cafe_count_1500_price_high
big_church_count_1500
church_count_1500
mosque_count_1500
leisure_count_1500
sport_count_1500
market_count_1500
green_part_2000
prom_part_2000
office_count_2000
office_sqm_2000
trc_count_2000
trc_sqm_2000
cafe_count_2000
cafe_sum_2000_min_price_avg
cafe_sum_2000_max_price_avg
cafe_avg_price_2000
cafe_count_2000_na_price
cafe_count_2000_price_500
cafe_count_2000_price_1000
cafe_count_2000_price_1500
cafe_count_2000_price_2500
cafe_count_2000_price_4000
cafe_count_2000_price_high
big_church_count_2000
church_count_2000
mosque_count_2000
leisure_count_2000
sport_count_2000
market_count_2000
green_part_3000
prom_part_3000
office_count_3000
office_sqm_3000
trc_count_3000
trc_sqm_3000
cafe_count_3000
cafe_sum_3000_min_price_avg
cafe_sum_3000_max_price_avg
cafe_avg_price_3000
cafe_count_3000_na_price
cafe_count_3000_price_500
cafe_count_3000_price_1000
cafe_count_3000_price_1500
cafe_count_3000_price_2500
cafe_count_3000_price_4000
cafe_count_3000_price_high
big_church_count_3000
church_count_3000
mosque_count_3000
leisure_count_3000
sport_count_3000
market_count_3000
green_part_5000
prom_part_5000
office_count_5000
office_sqm_5000
trc_count_5000
trc_sqm_5000
cafe_count_5000
cafe_sum_5000_min_price_avg
cafe_sum_5000_max_price_avg
cafe_avg_price_5000
cafe_count_5000_na_price
cafe_count_5000_price_500
cafe_count_5000_price_1000
cafe_count_5000_price_1500
cafe_count_5000_price_2500
cafe_count_5000_price_4000
cafe_count_5000_price_high
big_church_count_5000
church_count_5000
mosque_count_5000
leisure_count_5000
sport_count_5000
market_count_5000
price_doc
id
1
2011-08-20
43
27.00
4.00
nan
nan
nan
nan
nan
nan
Investment
Bibirevo
6,407,578.10
155572
0.19
0.00
9576
5,001.00
5
10309
11,065.00
5
0
240.00
1
0
7
3
no
0
16
1
no
no
no
no
no
no
no
no
86206
40477
45729
21154
11007
10147
98207
52277
45930
36211
10580
25631
9576
4899
4677
10309
5463
4846
23603
12286
11317
17508
9425
8083
18654
9709
8945
211.00
25.00
0.00
0.00
0.00
2.00
184.00
0.00
0.00
0.00
211.00
0.00
0.00
0.00
206.00
5.00
1
2.59
1.13
13.58
1.13
0.15
0.18
2.16
0.60
1.08
23.68
1.80
3.63
5.42
65.04
1.00
5.42
6.91
1
0.27
3.30
0.99
no
1.42
10.92
13.10
13.68
15.16
1.42
1
no
3.83
5
1.31
no
14.23
101
24.29
1
18.15
5.72
1.21
1.06
5.81
4.31
10.81
1.68
0.49
3.07
1.11
8.15
3.52
2.39
4.25
0.97
6.72
0.88
0.65
0.64
0.95
0.18
0.63
0.63
3.93
14.05
7.39
7.02
0.52
good
0.00
0.00
0
0
0
0
0
nan
nan
nan
0
0
0
0
0
0
0
0
0
0
0
1
0
7.36
0.00
1
30500
3
55600
19
527.78
888.89
708.33
1
10
4
3
1
0
0
1
2
0
0
6
1
14.27
6.92
3
39554
9
171420
34
566.67
969.70
768.18
1
14
11
6
2
0
0
1
2
0
0
7
1
11.77
15.97
9
188854
19
1244891
36
614.29
1,042.86
828.57
1
15
11
6
2
1
0
1
2
0
0
10
1
11.98
13.55
12
251554
23
1419204
68
639.68
1,079.37
859.52
5
21
22
16
3
1
0
2
4
0
0
21
1
13.09
13.31
29
807385
52
4036616
152
708.57
1,185.71
947.14
12
39
48
40
9
4
0
13
22
1
0
52
4
5850000
2
2011-08-23
34
19.00
3.00
nan
nan
nan
nan
nan
nan
Investment
Nagatinskij Zaton
9,589,336.91
115352
0.37
0.05
6880
3,119.00
5
7759
6,237.00
8
0
229.00
1
0
6
1
yes
1
3
0
no
no
no
no
no
no
no
no
76284
34200
42084
15727
7925
7802
70194
35622
34572
29431
9266
20165
6880
3466
3414
7759
3909
3850
17700
8998
8702
15164
7571
7593
13729
6929
6800
245.00
83.00
1.00
0.00
67.00
4.00
90.00
0.00
0.00
0.00
244.00
1.00
1.00
143.00
84.00
15.00
2
0.94
0.65
7.62
0.64
0.15
0.27
0.55
0.07
0.97
1.32
4.66
8.65
3.41
40.94
2.00
3.64
4.68
2
0.07
0.78
0.70
no
9.50
3.10
6.44
8.13
8.70
2.89
2
no
3.10
4
0.69
no
9.24
32
5.71
2
9.03
3.49
2.72
1.25
3.42
0.73
6.91
3.42
0.67
2.00
8.97
6.13
1.16
2.54
12.65
1.48
1.85
0.69
0.52
0.69
1.07
0.27
0.97
0.47
4.84
6.83
0.71
2.36
0.23
excellent
25.14
0.00
0
0
0
0
5
860.00
1,500.00
1,180.00
0
1
3
0
0
1
0
0
1
0
0
0
0
26.66
0.07
2
86600
5
94065
13
615.38
1,076.92
846.15
0
5
6
1
0
1
0
1
2
0
4
2
0
21.53
7.71
3
102910
7
127065
17
694.12
1,205.88
950.00
0
6
7
1
2
1
0
1
5
0
4
9
0
22.37
19.25
4
165510
8
179065
21
695.24
1,190.48
942.86
0
7
8
3
2
1
0
1
5
0
4
11
0
18.07
27.32
12
821986
14
491565
30
631.03
1,086.21
858.62
1
11
11
4
2
1
0
1
7
0
6
19
1
10.26
27.47
66
2690465
40
2034942
177
673.81
1,148.81
911.31
9
49
65
36
15
3
0
15
29
1
10
66
14
6000000
3
2011-08-27
43
29.00
2.00
nan
nan
nan
nan
nan
nan
Investment
Tekstil'shhiki
4,808,269.83
101708
0.11
0.12
5879
1,463.00
4
6207
5,580.00
7
0
1,183.00
1
0
5
1
no
0
0
1
no
no
no
yes
no
no
no
no
101982
46076
55906
13028
6835
6193
63388
31813
31575
25292
7609
17683
5879
3095
2784
6207
3269
2938
14884
7821
7063
19401
9045
10356
11252
5916
5336
330.00
59.00
0.00
0.00
206.00
4.00
60.00
0.00
1.00
0.00
330.00
1.00
0.00
246.00
63.00
20.00
3
2.12
1.64
17.35
1.45
0.05
0.16
0.37
0.45
0.94
4.91
3.38
12.00
1.28
15.33
3.00
1.28
1.70
3
0.33
3.95
0.47
no
5.60
2.93
6.96
8.05
9.07
0.65
3
no
2.93
4
0.70
no
9.54
5
6.71
3
5.78
7.51
0.77
1.60
3.68
3.56
5.75
1.38
0.73
1.24
1.98
0.77
1.95
0.62
7.68
0.10
0.84
1.51
1.49
1.54
0.39
0.16
3.18
0.76
7.92
4.27
3.16
4.96
0.19
poor
1.67
0.00
0
0
0
0
3
666.67
1,166.67
916.67
0
0
2
1
0
0
0
0
0
0
0
0
0
4.99
0.29
0
0
0
0
9
642.86
1,142.86
892.86
2
0
5
2
0
0
0
0
1
0
0
5
3
9.92
6.73
0
0
1
2600
14
516.67
916.67
716.67
2
4
6
2
0
0
0
0
4
0
0
6
5
12.99
12.75
4
100200
7
52550
24
563.64
977.27
770.45
2
8
9
4
1
0
0
0
4
0
0
8
5
12.14
26.46
8
110856
7
52550
41
697.44
1,192.31
944.87
2
9
17
9
3
1
0
0
11
0
0
20
6
13.69
21.58
43
1478160
35
1572990
122
702.68
1,196.43
949.55
10
29
45
25
10
3
0
11
27
0
4
67
10
5700000
4
2011-09-01
89
50.00
9.00
nan
nan
nan
nan
nan
nan
Investment
Mitino
12,583,535.69
178473
0.19
0.07
13087
6,839.00
9
13670
17,063.00
10
0
nan
1
0
17
6
no
0
11
4
no
no
no
no
no
no
no
no
21155
9828
11327
28563
14680
13883
120381
60040
60341
29529
9083
20446
13087
6645
6442
13670
7126
6544
32063
16513
15550
3292
1450
1842
24934
12782
12152
458.00
9.00
51.00
12.00
124.00
50.00
201.00
0.00
9.00
2.00
459.00
13.00
24.00
40.00
130.00
252.00
4
1.49
0.98
11.57
0.96
0.18
0.24
0.08
0.11
0.45
15.62
2.02
14.32
4.29
51.50
4.00
3.82
5.27
4
0.13
1.58
1.20
no
2.68
14.61
17.46
18.31
19.49
2.68
1
no
2.78
17
2.00
no
17.48
83
6.73
1
27.67
9.52
6.35
1.77
11.18
0.58
27.89
0.81
0.62
1.95
6.48
7.39
4.92
3.55
8.79
2.16
10.90
0.62
0.60
0.93
0.89
0.24
1.03
1.56
15.30
16.99
16.04
5.03
0.47
good
17.36
0.57
0
0
0
0
2
1,000.00
1,500.00
1,250.00
0
0
0
2
0
0
0
0
0
0
0
0
0
19.25
10.35
1
11000
6
80780
12
658.33
1,083.33
870.83
0
3
4
5
0
0
0
0
0
0
0
3
1
28.38
6.57
2
11000
7
89492
23
673.91
1,130.43
902.17
0
5
9
8
1
0
0
1
0
0
0
9
2
32.29
5.73
2
11000
7
89492
25
660.00
1,120.00
890.00
0
5
11
8
1
0
0
1
1
0
0
13
2
20.79
3.57
4
167000
12
205756
32
718.75
1,218.75
968.75
0
5
14
10
3
0
0
1
2
0
0
18
3
14.18
3.89
8
244166
22
942180
61
931.58
1,552.63
1,242.11
4
7
21
15
11
2
1
4
4
0
0
26
3
13100000
5
2011-09-05
77
77.00
4.00
nan
nan
nan
nan
nan
nan
Investment
Basmannoe
8,398,460.62
108171
0.02
0.04
5706
3,240.00
7
6748
7,770.00
9
0
562.00
4
2
25
2
no
0
10
93
no
no
no
yes
yes
no
no
no
28179
13522
14657
13368
7159
6209
68043
34236
33807
26760
8563
18197
5706
2982
2724
6748
3664
3084
15237
8113
7124
5164
2583
2581
11631
6223
5408
746.00
48.00
0.00
0.00
643.00
16.00
35.00
0.00
3.00
1.00
746.00
371.00
114.00
146.00
62.00
53.00
5
1.26
0.88
8.27
0.69
0.25
0.38
0.26
0.24
0.39
10.68
2.94
11.90
0.85
10.25
5.00
1.60
2.16
113
0.07
0.86
0.82
no
11.62
1.72
0.05
0.79
2.58
1.72
4
no
3.13
10
0.08
yes
1.60
113
1.42
4
6.52
8.67
1.64
3.63
4.59
2.61
9.16
1.97
0.22
2.54
3.98
3.61
0.31
1.86
3.78
1.12
0.99
0.89
0.43
0.08
0.81
0.38
0.38
0.12
2.58
1.11
1.80
1.34
0.03
excellent
3.56
4.44
15
293699
1
45000
48
702.22
1,166.67
934.44
3
17
10
11
7
0
0
1
4
0
2
3
0
3.34
8.29
46
420952
3
158200
153
763.45
1,272.41
1,017.93
8
39
45
39
19
2
1
7
12
0
6
7
0
4.12
4.83
93
1195735
9
445900
272
766.80
1,272.73
1,019.76
19
70
74
72
30
6
1
18
30
0
10
14
2
4.53
5.02
149
1625130
17
564843
483
765.93
1,269.23
1,017.58
28
130
129
131
50
14
1
35
61
0
17
21
3
5.06
8.62
305
3420907
60
2296870
1068
853.03
1,410.45
1,131.74
63
266
267
262
149
57
4
70
121
1
40
77
5
8.38
10.92
689
8404624
114
3503058
2283
853.88
1,411.45
1,132.66
143
566
578
552
319
108
17
135
236
2
91
195
14
16331452
In [48]:
# Distribution of the target `price_doc`, on the raw scale (left) and on
# the natural-log scale (right).
fig, axs = plt.subplots(ncols=2, figsize=(12, 4))
price = train_df['price_doc']

# sns.distplot is deprecated; histplot with kde=True and stat='density'
# is the documented replacement (density-normalised histogram + KDE curve).
sns.histplot(price, kde=True, stat='density', ax=axs[0])
axs[0].set(title='price_doc', xlabel='price_doc')

sns.histplot(np.log(price), kde=True, stat='density', ax=axs[1])
# The logged Series keeps the name 'price_doc', so label the axis explicitly.
axs[1].set(title='log(price_doc)', xlabel='log(price_doc)')

fig.tight_layout()
plt.show()
In [49]:
# Distribution of `full_sq`, on the raw scale (left) and on the
# natural-log scale (right).
fig, axs = plt.subplots(ncols=2, figsize=(12, 4))
fs = train_df['full_sq']
# Keep only strictly positive values: log is undefined at 0 and below.
# Computed once and reused for both panels.
fs_positive = fs[fs > 0]

# sns.distplot is deprecated; histplot with kde=True and stat='density'
# is the documented replacement (density-normalised histogram + KDE curve).
sns.histplot(fs_positive, kde=True, stat='density', ax=axs[0])
axs[0].set(title='full_sq (> 0)', xlabel='full_sq')

sns.histplot(np.log(fs_positive), kde=True, stat='density', ax=axs[1])
# The logged Series keeps the name 'full_sq', so label the axis explicitly.
axs[1].set(title='log(full_sq)', xlabel='log(full_sq)')

fig.tight_layout()
plt.show()
In [50]:
# Summary statistics (count, mean, std, min, quartiles, max) of the target.
train_df.loc[:, 'price_doc'].describe()
Out[50]:
count 30,471.00
mean 7,123,035.28
std 4,780,111.33
min 100,000.00
25% 4,740,002.00
50% 6,274,411.00
75% 8,300,000.00
max 111,111,112.00
Name: price_doc, dtype: float64
In [75]:
# Add a 'YYYY-MM' string key derived from the sale timestamp.
# Uses the vectorized .dt accessor instead of a per-row Python lambda;
# '%m' is zero-padded, matching the previous str(month).zfill(2).
# Missing timestamps (NaT) now yield a missing value rather than the
# literal string 'nan-nan' that the row-wise version produced.
train_df['year_month'] = train_df['timestamp'].dt.strftime('%Y-%m')
In [ ]:
Content source: mbabd2016/russianhousingmarket
Similar notebooks: