In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline

In [2]:
train = pd.read_csv("data/train.csv", index_col="id", parse_dates=["timestamp"])
test = pd.read_csv("data/test.csv", index_col="id", parse_dates=["timestamp"])
macro = pd.read_csv("data/macro.csv", parse_dates=["timestamp"])

Data Review


In [5]:
train.head()


Out[5]:
timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state ... cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
id
1 2011-08-20 43 27.0 4.0 NaN NaN NaN NaN NaN NaN ... 9 4 0 13 22 1 0 52 4 5850000
2 2011-08-23 34 19.0 3.0 NaN NaN NaN NaN NaN NaN ... 15 3 0 15 29 1 10 66 14 6000000
3 2011-08-27 43 29.0 2.0 NaN NaN NaN NaN NaN NaN ... 10 3 0 11 27 0 4 67 10 5700000
4 2011-09-01 89 50.0 9.0 NaN NaN NaN NaN NaN NaN ... 11 2 1 4 4 0 0 26 3 13100000
5 2011-09-05 77 77.0 4.0 NaN NaN NaN NaN NaN NaN ... 319 108 17 135 236 2 91 195 14 16331452

5 rows × 291 columns


In [6]:
test.head()


Out[6]:
timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state ... cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000
id
30474 2015-07-01 39.0 20.7 2 9 1 1998.0 1 8.9 3.0 ... 8 0 0 0 1 10 1 0 14 1
30475 2015-07-01 79.2 NaN 8 17 1 0.0 3 1.0 1.0 ... 4 1 1 0 2 11 0 1 12 1
30476 2015-07-01 40.5 25.1 3 5 2 1960.0 2 4.8 2.0 ... 42 11 4 0 10 21 0 10 71 11
30477 2015-07-01 62.8 36.0 17 17 1 2016.0 2 62.8 3.0 ... 1 1 2 0 0 10 0 0 2 0
30478 2015-07-01 40.0 40.0 17 17 1 0.0 1 1.0 1.0 ... 5 1 1 0 2 12 0 1 11 1

5 rows × 290 columns


In [7]:
macro.head()


Out[7]:
timestamp oil_urals gdp_quart gdp_quart_growth cpi ppi gdp_deflator balance_trade balance_trade_growth usdrub ... provision_retail_space_modern_sqm turnover_catering_per_cap theaters_viewers_per_1000_cap seats_theather_rfmin_per_100000_cap museum_visitis_per_100_cap bandwidth_sports population_reg_sports_share students_reg_sports_share apartment_build apartment_fund_sqm
0 2010-01-01 76.1 NaN NaN NaN NaN NaN NaN NaN NaN ... 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN
1 2010-01-02 76.1 NaN NaN NaN NaN NaN NaN NaN NaN ... 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN
2 2010-01-03 76.1 NaN NaN NaN NaN NaN NaN NaN NaN ... 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN
3 2010-01-04 76.1 NaN NaN NaN NaN NaN NaN NaN 29.905 ... 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN
4 2010-01-05 76.1 NaN NaN NaN NaN NaN NaN NaN 29.836 ... 690.0 6221.0 527.0 0.41 993.0 NaN NaN 63.03 22825.0 NaN

5 rows × 100 columns
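
macro is daily data keyed by calendar date and is not used again in this section; if it is needed later it would be joined on timestamp, roughly like this (a sketch, not part of the original notebook):

train_macro = train.reset_index().merge(macro, on="timestamp", how="left").set_index("id")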


In [460]:
# the first column holds the original column names; naming it "Count" up front
# makes the grouped result below read as "Column Type | Count"
dtype_df = train.dtypes.reset_index()
dtype_df.columns = ["Count", "Column Type"]
print("Variable data types:")
dtype_df.groupby("Column Type").aggregate('count').reset_index()


Variable data types:
Out[460]:
Column Type Count
0 datetime64[ns] 1
1 int64 155
2 float64 120
3 object 15
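
A quick follow-up (not in the original notebook): the 15 object columns can be listed directly, since these are the ones that will need encoding before modelling. A minimal sketch:

obj_cols = train.select_dtypes(include=["object"]).columns
print(len(obj_cols), "object columns:", list(obj_cols))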

In [16]:
# more missing values in the train dataset, e.g. for build_year, state and kitch_sq
train.isnull().sum().sort_values(ascending=False)


Out[16]:
hospital_beds_raion                      14441
build_year                               13605
state                                    13559
cafe_sum_500_min_price_avg               13281
cafe_sum_500_max_price_avg               13281
cafe_avg_price_500                       13281
kitch_sq                                  9572
num_room                                  9572
material                                  9572
max_floor                                 9572
preschool_quota                           6688
school_quota                              6685
cafe_sum_1000_max_price_avg               6524
cafe_sum_1000_min_price_avg               6524
cafe_avg_price_1000                       6524
life_sq                                   6383
build_count_frame                         4991
build_count_after_1995                    4991
build_count_wood                          4991
build_count_block                         4991
raion_build_count_with_material_info      4991
build_count_panel                         4991
build_count_monolith                      4991
build_count_brick                         4991
build_count_before_1920                   4991
build_count_mix                           4991
build_count_1921-1945                     4991
build_count_foam                          4991
build_count_1946-1970                     4991
raion_build_count_with_builddate_info     4991
                                         ...  
cafe_count_1500_price_500                    0
cafe_count_1500_na_price                     0
cafe_count_1500                              0
trc_sqm_1500                                 0
trc_count_1500                               0
office_sqm_1500                              0
office_count_1500                            0
prom_part_1500                               0
green_part_1500                              0
market_count_1000                            0
sport_count_1000                             0
leisure_count_1000                           0
mosque_count_1000                            0
big_church_count_1000                        0
market_count_500                             0
cafe_count_1000_price_high                   0
cafe_count_1000_price_4000                   0
cafe_count_1000_price_2500                   0
cafe_count_1000_price_1500                   0
cafe_count_1000_price_1000                   0
cafe_count_1000_price_500                    0
cafe_count_1000_na_price                     0
cafe_count_1000                              0
trc_sqm_1000                                 0
trc_count_1000                               0
office_sqm_1000                              0
office_count_1000                            0
prom_part_1000                               0
green_part_1000                              0
timestamp                                    0
dtype: int64

In [17]:
test.isnull().sum().sort_values(ascending=False)


Out[17]:
hospital_beds_raion                      3418
cafe_avg_price_500                       3159
cafe_sum_500_max_price_avg               3159
cafe_sum_500_min_price_avg               3159
preschool_quota                          1596
school_quota                             1595
cafe_sum_1000_min_price_avg              1222
cafe_sum_1000_max_price_avg              1222
cafe_avg_price_1000                      1222
build_count_wood                         1218
build_count_1921-1945                    1218
build_count_after_1995                   1218
build_count_1971-1995                    1218
build_count_1946-1970                    1218
raion_build_count_with_material_info     1218
build_count_block                        1218
build_count_before_1920                  1218
build_count_frame                        1218
raion_build_count_with_builddate_info    1218
build_count_brick                        1218
build_count_mix                          1218
build_count_slag                         1218
build_count_monolith                     1218
build_count_foam                         1218
build_count_panel                        1218
life_sq                                  1176
build_year                               1049
cafe_sum_1500_min_price_avg               821
cafe_sum_1500_max_price_avg               821
cafe_avg_price_1500                       821
                                         ... 
cafe_count_1500_na_price                    0
cafe_count_1500                             0
trc_sqm_1500                                0
trc_count_1500                              0
office_sqm_1500                             0
office_count_1500                           0
prom_part_1500                              0
green_part_1500                             0
market_count_1000                           0
sport_count_1000                            0
leisure_count_1000                          0
mosque_count_1000                           0
church_count_1000                           0
big_church_count_1000                       0
cafe_count_1000_price_high                  0
cafe_count_1000_price_4000                  0
cafe_count_1000_price_2500                  0
cafe_count_1000_price_1500                  0
cafe_count_1000_price_1000                  0
cafe_count_1000_price_500                   0
cafe_count_1000_na_price                    0
cafe_count_1000                             0
trc_sqm_1000                                0
trc_count_1000                              0
office_sqm_1000                             0
office_count_1000                           0
prom_part_1000                              0
green_part_1000                             0
market_count_500                            0
timestamp                                   0
dtype: int64
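
To compare missingness in train and test side by side, the two null summaries can be combined into one frame sorted by the train percentage. A minimal sketch, assuming the frames loaded above:

missing = pd.concat(
    [train.isnull().mean().mul(100).rename("train_pct"),
     test.isnull().mean().mul(100).rename("test_pct")],
    axis=1,
).sort_values("train_pct", ascending=False)
print(missing.head(10).round(1))  # price_doc only exists in train, so its test_pct is NaN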

In [23]:
train[["price_doc", "full_sq"]].plot(kind="scatter", x="full_sq", y="price_doc")
pass



In [24]:
# there is a very high price of 1.111111e+08 -> probably an outlier
train.price_doc.describe()


Out[24]:
count    3.047100e+04
mean     7.123035e+06
std      4.780111e+06
min      1.000000e+05
25%      4.740002e+06
50%      6.274411e+06
75%      8.300000e+06
max      1.111111e+08
Name: price_doc, dtype: float64

In [8]:
# full_sq has an outlier (max 5326) -> remove it
train.full_sq.describe()


Out[8]:
count    30471.000000
mean        54.214269
std         38.031487
min          0.000000
25%         38.000000
50%         49.000000
75%         63.000000
max       5326.000000
Name: full_sq, dtype: float64

In [9]:
test.full_sq.describe()


Out[9]:
count    7662.000000
mean       53.701164
std        20.104320
min         0.000000
25%        39.200000
50%        50.410000
75%        63.300000
max       403.000000
Name: full_sq, dtype: float64
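
Before choosing a cut-off it helps to look at the most extreme full_sq rows directly. A sketch, not part of the original notebook:

print(train.nlargest(5, "full_sq")[["full_sq", "life_sq", "num_room", "price_doc"]])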

In [20]:
# full_sq values are rounded to whole numbers in the train dataset
train.full_sq.head()


Out[20]:
id
1    43
2    34
3    43
4    89
5    77
Name: full_sq, dtype: int64

In [21]:
test.full_sq.head()


Out[21]:
id
30474    39.0
30475    79.2
30476    40.5
30477    62.8
30478    40.0
Name: full_sq, dtype: float64

In [18]:
# life_sq has an outlier -> remove the 7478 value
train.life_sq.describe()


Out[18]:
count    24088.000000
mean        34.403271
std         52.285733
min          0.000000
25%         20.000000
50%         30.000000
75%         43.000000
max       7478.000000
Name: life_sq, dtype: float64

In [19]:
test.life_sq.describe()


Out[19]:
count    6486.000000
mean       32.660037
std        22.737855
min         0.000000
25%        19.900000
50%        30.400000
75%        41.800000
max       869.000000
Name: life_sq, dtype: float64

In [38]:
# build_year is very messy -> treat every value outside 1691-2020 as NA
train.build_year.value_counts(dropna=False)


Out[38]:
NaN            7998
 2014.0         703
 2015.0         645
 1970.0         418
 0.0            413
 1969.0         407
 2013.0         391
 1968.0         389
 1967.0         384
 1965.0         378
 1972.0         360
 1974.0         357
 1971.0         352
 1966.0         348
 1960.0         344
 1962.0         338
 1973.0         333
 2016.0         330
 1963.0         325
 1964.0         315
 1975.0         309
 1961.0         297
 1976.0         263
 1977.0         260
 2006.0         242
 1.0            240
 1979.0         236
 1978.0         235
 2008.0         234
 2012.0         233
               ... 
 1890.0           5
 1912.0           5
 1947.0           4
 1924.0           3
 1949.0           3
 1914.0           3
 1943.0           2
 1860.0           2
 1907.0           2
 1900.0           2
 3.0              2
 1896.0           2
 1946.0           2
 1941.0           2
 1920.0           1
 1691.0           1
 1906.0           1
 1895.0           1
 1904.0           1
 71.0             1
 20052009.0       1
 1886.0           1
 1925.0           1
 4965.0           1
 1905.0           1
 1911.0           1
 2018.0           1
 215.0            1
 1948.0           1
 1876.0           1
Name: build_year, dtype: int64

In [39]:
test.build_year.value_counts(dropna=False)


Out[39]:
NaN        1049
 2017.0     413
 2016.0     400
 0.0        369
 2013.0     260
 2015.0     259
 2014.0     199
 1.0        187
 1969.0     141
 1966.0     138
 1968.0     137
 1967.0     133
 1971.0     133
 1970.0     122
 1960.0     118
 1972.0     118
 1973.0     116
 1964.0     114
 1962.0     111
 1965.0     109
 1974.0     109
 1961.0     102
 1963.0      98
 1975.0      97
 1977.0      96
 1978.0      94
 2006.0      92
 1976.0      86
 1980.0      82
 1979.0      81
           ... 
 1911.0       3
 1932.0       3
 1939.0       3
 1927.0       3
 1938.0       3
 1931.0       3
 1917.0       3
 1930.0       3
 1941.0       2
 1890.0       2
 1914.0       2
 1945.0       1
 1935.0       1
 1948.0       1
 1902.0       1
 2.0          1
 1906.0       1
 215.0        1
 1895.0       1
 1943.0       1
 1912.0       1
 1905.0       1
 1875.0       1
 1898.0       1
 1908.0       1
 1942.0       1
 1949.0       1
 1910.0       1
 1924.0       1
 1933.0       1
Name: build_year, dtype: int64
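
The rule suggested above (treat anything outside 1691-2020 as missing) can be checked before applying it. A sketch; note that the cleaning cell further down uses the looser bounds 1000 and 2030 instead:

bad_years = (train.build_year < 1691) | (train.build_year > 2020)
print(bad_years.sum(), "suspicious build_year values in train")
print(train.loc[bad_years, "build_year"].value_counts())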

In [43]:
# the single 33 is an error -> convert it to 3 (or NA) and treat state as an ordinal variable
train.state.value_counts(dropna=False)


Out[43]:
NaN      8879
 2.0     5843
 3.0     5766
 1.0     3175
 4.0      421
 33.0       1
Name: state, dtype: int64

In [44]:
test.state.value_counts(dropna=False)


Out[44]:
 2.0    2662
 1.0    2266
 3.0    1913
NaN      694
 4.0     127
Name: state, dtype: int64
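
The single 33 is presumably a typo for 3, so it could either be mapped back or set to missing (the cleaning cell below does the latter). A sketch of the first option:

train.loc[train.state == 33, "state"] = 3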

In [45]:
train.material.value_counts(dropna=False)


Out[45]:
 1.0    10660
NaN      7328
 2.0     2921
 5.0     1546
 4.0     1000
 6.0      630
Name: material, dtype: int64

In [46]:
test.material.value_counts(dropna=False)


Out[46]:
1    5241
2     958
4     619
5     487
6     356
3       1
Name: material, dtype: int64

In [61]:
# the maximum floor value (77) is implausibly high
train.floor.describe()


Out[61]:
count    23945.000000
mean         7.243475
std          5.117744
min          0.000000
25%          3.000000
50%          6.000000
75%         10.000000
max         77.000000
Name: floor, dtype: float64

In [62]:
test.floor.describe()


Out[62]:
count    7662.000000
mean        7.652571
std         5.099418
min         1.000000
25%         4.000000
50%         7.000000
75%        11.000000
max        41.000000
Name: floor, dtype: float64

In [65]:
train.max_floor.describe()


Out[65]:
count    16756.000000
mean        12.433815
std          6.310940
min          0.000000
25%          9.000000
50%         12.000000
75%         17.000000
max        117.000000
Name: max_floor, dtype: float64

In [66]:
test.max_floor.describe()


Out[66]:
count    7662.000000
mean       12.591099
std         6.659291
min         0.000000
25%         9.000000
50%        14.000000
75%        17.000000
max        48.000000
Name: max_floor, dtype: float64

In [68]:
train.product_type.value_counts(dropna=False)


Out[68]:
Investment       19368
OwnerOccupier     4576
Name: product_type, dtype: int64

In [69]:
# product_type has NaN values in test but not in train
test.product_type.value_counts(dropna=False)


Out[69]:
Investment       4998
OwnerOccupier    2631
NaN                33
Name: product_type, dtype: int64

In [715]:
# train prices are mostly round values for the Investment product type
train[["price_doc", "product_type"]][train.product_type == "Investment"].price_doc.value_counts(dropna=False).head(30)


Out[715]:
2000000    750
1000000    743
6000000    369
3000000    329
6500000    327
7000000    319
5500000    309
6300000    292
5000000    292
7500000    275
6200000    275
6400000    232
6800000    229
8500000    229
8000000    227
6100000    218
5800000    208
5900000    206
5700000    202
7200000    199
5300000    197
6900000    190
6600000    189
9000000    185
6700000    185
5600000    183
5200000    181
7400000    179
5400000    170
7300000    157
Name: price_doc, dtype: int64

In [78]:
# for OwnerOccupier the prices look arbitrary rather than rounded
train[["price_doc", "product_type"]][~(train.product_type == "Investment")].price_doc.value_counts(dropna=False).head(10)


Out[78]:
4400000    25
3912000    12
3274574    12
2199436    11
4209255    10
6592000    10
3059062     9
4238000     9
6552000     9
3813607     9
Name: price_doc, dtype: int64
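
The "round prices for Investment" pattern can be quantified as the share of prices that are exact multiples of 100 000 per product type. A minimal sketch:

round_share = (train.price_doc % 100000 == 0).groupby(train.product_type).mean()
print((round_share * 100).round(1))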

In [91]:
# num_room has missing values in train but not in test
train.num_room.value_counts(dropna=False)


Out[91]:
NaN      7186
 2.0     6492
 1.0     5905
 3.0     3915
 4.0      381
 5.0       38
 0.0        8
 6.0        7
 8.0        3
 10.0       2
 7.0        1
 19.0       1
 9.0        1
 17.0       1
Name: num_room, dtype: int64

In [92]:
test.num_room.value_counts(dropna=False)


Out[92]:
2     3097
1     2855
3     1554
4      133
5       19
7        2
19       1
6        1
Name: num_room, dtype: int64

In [98]:
# dtypes differ: float64 in train (because of the NaNs) vs int64 in test
train.num_room.tail()


Out[98]:
id
30467    2.0
30469    2.0
30470    4.0
30472    2.0
30473    2.0
Name: num_room, dtype: float64

In [99]:
test.num_room.tail()


Out[99]:
id
38131    2
38132    2
38133    1
38134    1
38135    3
Name: num_room, dtype: int64
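
If the two frames should share a dtype, a recent pandas (1.0 or later) offers the nullable Int64 type, which keeps integer values while still allowing missing entries. A sketch, and an assumption about the pandas version rather than something done in this notebook:

train["num_room"] = train["num_room"].astype("Int64")  # nullable integer, NaNs become <NA>
test["num_room"] = test["num_room"].astype("Int64")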

In [47]:
# check for some basic inconsistencies (e.g. life_sq larger than full_sq)
train[(train.full_sq - train.life_sq) < 0].timestamp.count()


Out[47]:
37

In [48]:
test[(test.full_sq - test.life_sq) < 0].timestamp.count()


Out[48]:
12

In [49]:
train[(train.full_sq - train.kitch_sq) < 0].timestamp.count()


Out[49]:
12

In [50]:
test[(test.full_sq - test.kitch_sq) < 0].timestamp.count()


Out[50]:
12

In [51]:
train[(train.life_sq - train.kitch_sq) < 0].timestamp.count()


Out[51]:
56

In [52]:
test[(test.life_sq - test.kitch_sq) < 0].timestamp.count()


Out[52]:
24

In [75]:
train[(train.max_floor - train.floor) < 0].timestamp.count()


Out[75]:
1484

In [76]:
test[(test.max_floor - test.floor) < 0].timestamp.count()


Out[76]:
643
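
The same pairwise checks can be collected in a single loop so that train and test are compared side by side (a sketch based on the cells above; NaN comparisons evaluate to False, so the counts match the ones printed above):

pairs = [("full_sq", "life_sq"), ("full_sq", "kitch_sq"),
         ("life_sq", "kitch_sq"), ("max_floor", "floor")]
for big, small in pairs:
    print("{} < {}: train={}, test={}".format(
        big, small,
        (train[big] < train[small]).sum(),
        (test[big] < test[small]).sum()))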

In [227]:
# full_sq == 0 rows make no sense -> remove them
train[train.full_sq == 0]


Out[227]:
timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state ... cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
id
17935 2014-04-28 0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 ... 0 0 0 1 7 1 0 12 1 4425000

1 rows × 291 columns


In [229]:
test[test.full_sq == 0]


Out[229]:
timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state ... cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000
id
30938 2015-08-05 0.0 38.0 7 19 1 0.0 1 0.0 1.0 ... 3 1 0 0 1 6 0 0 6 1

1 rows × 290 columns


In [243]:
train[(train.full_sq == 0) | (train.full_sq == 1)].timestamp.count()


Out[243]:
26

In [244]:
test[(test.full_sq == 0) | (test.full_sq == 1)].timestamp.count()


Out[244]:
2

In [245]:
train[(train.life_sq == 0) | (train.life_sq == 1)].timestamp.count()


Out[245]:
435

In [246]:
test[(test.life_sq == 0) | (test.life_sq == 1)].timestamp.count()


Out[246]:
329

In [235]:
train[train.kitch_sq == 0].timestamp.count()


Out[235]:
1381

In [236]:
test[test.kitch_sq == 0].timestamp.count()


Out[236]:
376

In [237]:
# kitch_sq == 1 is not a plausible size -> convert to NA
train[train.kitch_sq == 1].timestamp.count()


Out[237]:
4854

In [238]:
test[test.kitch_sq == 1].timestamp.count()


Out[238]:
1752
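
The 0/1 placeholder counts can be gathered for all three area columns in one loop (a sketch):

for col in ["full_sq", "life_sq", "kitch_sq"]:
    print("{}: train={}, test={}".format(
        col, train[col].isin([0, 1]).sum(), test[col].isin([0, 1]).sum()))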

In [321]:
# kitch_sq values of 1968 and 1000 are clearly errors -> replace with missing values
test.kitch_sq.dropna().sort_values(ascending=False).head()


Out[321]:
id
31381    1968.0
36811    1000.0
36011     116.7
35339      86.3
34333      74.0
Name: kitch_sq, dtype: float64
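
The 1968 looks like a build year that ended up in the wrong column (the cleaning cell below indeed copies it into build_year for that row). A quick look at the offending rows, as a sketch:

print(test.loc[test.kitch_sq > 500, ["kitch_sq", "build_year", "full_sq"]])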

In [365]:
# rows where num_room == 0 although full_sq / life_sq is defined -> the room count is clearly wrong
train[((train.full_sq / train.life_sq) > 0) > train.num_room]


Out[365]:
timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state ... cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
id
18126 2014-05-05 112 112.0 3.0 3.0 1.0 0.0 0.0 0.0 1.0 ... 10 3 0 10 9 0 0 38 4 7600002
18672 2014-05-20 209 209.0 0.0 3.0 4.0 2007.0 0.0 0.0 1.0 ... 1 0 0 1 7 0 0 4 1 10500000
21442 2014-08-12 74 41.0 12.0 0.0 1.0 NaN 0.0 12.0 NaN ... 0 0 0 1 7 1 0 12 1 8150340
23194 2014-10-01 84 84.0 3.0 15.0 2.0 0.0 0.0 0.0 1.0 ... 1 0 0 1 4 1 0 14 1 7580000
24883 2014-11-18 34 19.0 9.0 16.0 5.0 1982.0 0.0 9.0 3.0 ... 5 1 0 5 12 0 0 29 3 5400000
25375 2014-11-26 30 18.0 2.0 8.0 2.0 1961.0 0.0 6.0 2.0 ... 48 8 2 12 33 2 9 89 10 1000000
27457 2015-01-22 135 134.0 3.0 3.0 6.0 2013.0 0.0 0.0 1.0 ... 3 1 0 0 7 0 0 3 2 9308100

7 rows × 291 columns


In [363]:
train[train.num_room == 0].timestamp.count()


Out[363]:
14

In [364]:
test[test.num_room == 0].timestamp.count()


Out[364]:
0

In [367]:
train[((train.full_sq - train.life_sq) == 0)].timestamp.count()


Out[367]:
2750

In [368]:
test[((test.full_sq - test.life_sq) == 0)].timestamp.count()


Out[368]:
670

In [403]:
# remove these rows, as the life_sq / num_room combination makes no sense here
train[(train.full_sq > 0) & (train.life_sq == 1) & (train.num_room == 1)]


Out[403]:
timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state ... cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000 price_doc
id
10459 2013-09-12 35 1.0 9.0 16.0 1.0 NaN 1.0 1.0 NaN ... 1 0 0 1 6 1 0 13 1 3280650
10511 2013-09-14 46 1.0 3.0 17.0 4.0 NaN 1.0 1.0 1.0 ... 10 3 0 10 9 0 0 38 4 4121629
10531 2013-09-16 38 1.0 2.0 17.0 1.0 NaN 1.0 1.0 1.0 ... 1 0 0 2 3 0 0 6 1 4294675
10644 2013-09-21 41 1.0 2.0 1.0 6.0 NaN 1.0 41.0 1.0 ... 19 7 2 3 17 0 0 51 3 7529400
10727 2013-09-25 46 1.0 12.0 1.0 1.0 NaN 1.0 1.0 1.0 ... 2 2 0 1 12 1 0 8 0 4244188
10891 2013-10-01 34 1.0 8.0 1.0 1.0 NaN 1.0 1.0 1.0 ... 2 2 0 1 12 1 0 8 0 3795000
10897 2013-10-01 38 1.0 7.0 17.0 1.0 NaN 1.0 1.0 1.0 ... 1 0 0 4 6 0 0 5 1 4033260
11160 2013-10-10 43 1.0 13.0 13.0 6.0 NaN 1.0 40.0 1.0 ... 11 2 1 5 4 0 1 32 5 6600879
11532 2013-10-24 46 1.0 25.0 25.0 6.0 2015.0 1.0 1.0 1.0 ... 1 0 0 2 3 0 0 6 1 4966206
11555 2013-10-25 38 1.0 6.0 12.0 1.0 NaN 1.0 1.0 NaN ... 2 1 0 0 10 0 0 0 0 3362009
11670 2013-10-29 37 1.0 9.0 17.0 1.0 NaN 1.0 1.0 NaN ... 1 0 0 2 3 0 0 6 1 4138020
11687 2013-10-30 40 1.0 25.0 1.0 4.0 2013.0 1.0 1.0 1.0 ... 13 4 1 5 7 0 1 38 5 7259647
11791 2013-11-05 46 1.0 11.0 25.0 1.0 2015.0 1.0 1.0 1.0 ... 1 0 0 2 3 0 0 6 1 4740002
11814 2013-11-05 116 1.0 2.0 22.0 1.0 2013.0 1.0 1.0 1.0 ... 8 1 0 11 13 1 3 60 2 16658964
11918 2013-11-07 48 1.0 3.0 17.0 1.0 NaN 1.0 1.0 1.0 ... 3 2 0 2 16 1 0 20 4 4360150
11921 2013-11-07 121 1.0 1.0 3.0 5.0 NaN 1.0 1.0 1.0 ... 2 2 0 2 9 0 0 7 3 7904621
11948 2013-11-08 40 1.0 16.0 17.0 1.0 NaN 1.0 1.0 1.0 ... 10 3 0 10 9 0 0 38 4 4033800
11969 2013-11-08 37 1.0 2.0 30.0 4.0 2014.0 1.0 1.0 1.0 ... 15 3 0 11 11 0 1 54 5 7504715
12127 2013-11-14 37 1.0 15.0 17.0 1.0 NaN 1.0 1.0 1.0 ... 241 82 16 81 159 1 66 174 12 3886596
12435 2013-11-25 31 1.0 22.0 25.0 1.0 NaN 1.0 1.0 1.0 ... 2 2 0 1 12 1 0 8 0 3828000
12807 2013-12-04 33 1.0 21.0 25.0 1.0 NaN 1.0 1.0 1.0 ... 2 2 0 1 12 1 0 8 0 3771264
12984 2013-12-11 97 1.0 23.0 1.0 4.0 2015.0 1.0 1.0 1.0 ... 25 3 0 12 28 0 7 84 5 15470484
13043 2013-12-12 38 1.0 3.0 17.0 1.0 NaN 1.0 1.0 1.0 ... 1 0 0 2 3 0 0 6 1 4427500
13092 2013-12-13 46 1.0 6.0 25.0 1.0 NaN 1.0 1.0 1.0 ... 1 0 0 2 3 0 0 6 1 4740002
13174 2013-12-17 48 1.0 25.0 25.0 1.0 NaN 1.0 1.0 1.0 ... 1 0 0 4 6 0 0 5 1 4966206
13290 2013-12-20 45 1.0 5.0 1.0 1.0 2014.0 1.0 1.0 1.0 ... 36 3 1 6 31 1 6 77 6 7406640
13329 2013-12-23 44 1.0 7.0 1.0 1.0 1.0 1.0 1.0 1.0 ... 0 0 0 1 3 0 2 10 1 4664000
13540 2013-12-30 44 1.0 15.0 17.0 1.0 2014.0 1.0 1.0 1.0 ... 32 11 1 8 27 2 5 70 4 7190400
13562 2013-12-31 39 1.0 8.0 17.0 1.0 NaN 1.0 1.0 1.0 ... 2 2 0 1 12 1 0 8 0 4155840
13622 2014-01-10 38 1.0 7.0 17.0 1.0 NaN 1.0 1.0 1.0 ... 2 1 0 0 10 0 0 0 0 3513890
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
28547 2015-03-23 56 1.0 8.0 20.0 4.0 2012.0 1.0 1.0 1.0 ... 17 2 0 7 32 1 6 64 11 11333414
28592 2015-03-24 42 1.0 6.0 1.0 1.0 NaN 1.0 1.0 1.0 ... 6 1 0 5 13 0 1 29 4 4525950
28689 2015-03-28 46 1.0 2.0 25.0 1.0 1.0 1.0 1.0 1.0 ... 13 4 1 5 7 0 1 38 5 6112746
28802 2015-04-01 49 1.0 5.0 1.0 1.0 2017.0 1.0 1.0 1.0 ... 2 2 0 3 13 1 0 9 3 4575600
28991 2015-04-08 32 1.0 2.0 19.0 1.0 NaN 1.0 1.0 1.0 ... 371 141 26 150 249 2 105 203 13 3541484
29039 2015-04-10 38 1.0 9.0 17.0 1.0 2015.0 1.0 1.0 1.0 ... 1 0 0 3 2 0 0 3 1 5490262
29040 2015-04-10 38 1.0 16.0 17.0 1.0 2015.0 1.0 1.0 1.0 ... 1 1 0 2 12 0 1 10 0 5810343
29054 2015-04-10 47 1.0 23.0 1.0 1.0 2016.0 1.0 1.0 1.0 ... 103 58 13 22 40 1 15 127 7 10176494
29066 2015-04-11 34 1.0 1.0 3.0 1.0 2015.0 1.0 1.0 1.0 ... 371 141 26 150 249 2 105 203 13 1858668
29199 2015-04-17 45 1.0 8.0 15.0 6.0 2015.0 1.0 1.0 1.0 ... 2 2 0 2 9 0 0 5 3 4267900
29647 2015-05-12 36 1.0 4.0 17.0 1.0 NaN 1.0 1.0 1.0 ... 2 1 0 0 10 0 0 0 0 4088072
29656 2015-05-12 37 1.0 12.0 22.0 1.0 NaN 1.0 1.0 1.0 ... 371 141 26 150 249 2 105 203 13 3684562
29662 2015-05-12 32 1.0 12.0 19.0 1.0 2016.0 1.0 1.0 1.0 ... 1 0 0 1 6 0 0 6 1 3708032
29690 2015-05-13 49 1.0 8.0 15.0 6.0 2015.0 1.0 1.0 1.0 ... 2 2 0 2 9 0 0 7 2 4272800
29766 2015-05-18 50 1.0 18.0 20.0 4.0 2015.0 1.0 1.0 1.0 ... 12 7 2 3 15 0 0 23 0 7350000
29946 2015-05-26 34 1.0 15.0 17.0 1.0 2016.0 1.0 1.0 1.0 ... 2 1 0 0 10 0 0 0 0 3865945
29997 2015-05-28 42 1.0 10.0 12.0 5.0 2015.0 1.0 1.0 1.0 ... 2 2 0 3 12 0 0 6 3 4594590
30035 2015-05-29 54 1.0 2.0 3.0 1.0 NaN 1.0 1.0 1.0 ... 371 141 26 150 249 2 105 203 13 3053307
30054 2015-05-30 41 1.0 15.0 1.0 4.0 NaN 1.0 1.0 1.0 ... 7 2 0 5 17 0 2 48 6 7375275
30076 2015-06-01 27 1.0 2.0 9.0 1.0 2015.0 1.0 1.0 1.0 ... 1 0 0 0 6 0 0 0 0 2422315
30092 2015-06-02 36 1.0 11.0 15.0 1.0 2017.0 1.0 1.0 1.0 ... 2 2 0 2 9 0 0 7 2 3610000
30097 2015-06-03 49 1.0 12.0 1.0 1.0 1.0 1.0 1.0 1.0 ... 2 2 0 2 10 0 0 6 3 4364990
30254 2015-06-13 39 1.0 13.0 17.0 1.0 2015.0 1.0 1.0 1.0 ... 7 2 0 5 14 0 3 17 2 4583565
30320 2015-06-18 47 1.0 8.0 1.0 1.0 2016.0 1.0 1.0 1.0 ... 99 57 12 23 42 1 13 123 7 9475779
30344 2015-06-20 50 1.0 22.0 1.0 1.0 2015.0 1.0 1.0 1.0 ... 12 6 2 4 15 0 0 22 0 7350000
30356 2015-06-20 37 1.0 9.0 1.0 1.0 NaN 1.0 1.0 1.0 ... 1 2 0 0 7 0 0 2 0 3596400
30378 2015-06-23 38 1.0 16.0 17.0 1.0 NaN 1.0 1.0 1.0 ... 10 1 0 6 11 0 3 37 4 5477975
30394 2015-06-24 41 1.0 3.0 1.0 1.0 1.0 1.0 1.0 1.0 ... 0 0 0 1 3 0 2 10 1 4480333
30456 2015-06-29 45 1.0 6.0 15.0 6.0 2015.0 1.0 1.0 1.0 ... 2 1 0 3 8 1 0 20 3 4066740
30461 2015-06-29 50 1.0 7.0 7.0 1.0 2015.0 1.0 0.0 1.0 ... 13 1 0 5 15 0 3 35 3 6370777

181 rows × 291 columns


In [376]:
test[(test.full_sq > 0) & (test.life_sq == 1) & (test.num_room == 1)]


Out[376]:
timestamp full_sq life_sq floor max_floor material build_year num_room kitch_sq state ... cafe_count_5000_price_1500 cafe_count_5000_price_2500 cafe_count_5000_price_4000 cafe_count_5000_price_high big_church_count_5000 church_count_5000 mosque_count_5000 leisure_count_5000 sport_count_5000 market_count_5000
id
30518 2015-07-03 35.82 1.0 14 1 1 2015.0 1 1.0 1.0 ... 94 72 40 9 18 36 1 9 107 6
30549 2015-07-06 43.30 1.0 19 22 1 2015.0 1 1.0 1.0 ... 4 1 1 0 2 11 0 1 12 1
30662 2015-07-15 56.80 1.0 10 25 1 2015.0 1 1.0 1.0 ... 18 11 2 1 5 4 0 1 32 5
30793 2015-07-27 67.67 1.0 21 1 1 2016.0 1 1.0 1.0 ... 123 91 50 12 22 38 1 11 122 7
30846 2015-07-29 39.20 1.0 11 17 1 2015.0 1 1.0 1.0 ... 12 7 2 0 5 14 0 3 17 2
30855 2015-07-30 48.19 1.0 4 1 1 2017.0 1 1.0 1.0 ... 41 25 3 0 14 28 1 4 92 6
30898 2015-08-04 43.34 1.0 17 25 1 2016.0 1 1.0 1.0 ... 18 11 2 1 5 4 0 1 32 5
30935 2015-08-05 43.06 1.0 6 25 1 2016.0 1 1.0 1.0 ... 18 11 2 1 5 4 0 1 32 5
30940 2015-08-06 39.59 1.0 3 18 1 2015.0 1 1.0 1.0 ... 40 13 3 0 9 29 0 7 63 13
30977 2015-08-08 41.39 1.0 3 1 1 2017.0 1 1.0 1.0 ... 41 25 3 0 14 29 1 4 90 6
31039 2015-08-13 46.99 1.0 6 1 1 2015.0 1 1.0 1.0 ... 94 72 40 9 18 36 1 9 107 6
31180 2015-08-21 41.90 1.0 6 15 1 2015.0 1 1.0 1.0 ... 4 2 2 0 2 9 0 0 7 2
31182 2015-08-21 39.90 1.0 14 1 1 2015.0 1 1.0 1.0 ... 61 17 6 0 13 21 0 2 81 11
31187 2015-08-21 37.80 1.0 12 19 1 2016.0 1 1.0 1.0 ... 3 1 0 0 1 6 0 0 6 1
31189 2015-08-21 42.91 1.0 3 9 1 2016.0 1 1.0 1.0 ... 4 1 1 0 2 11 0 1 12 1
31231 2015-08-24 34.80 1.0 12 15 6 2016.0 1 10.0 1.0 ... 5 2 1 0 2 9 0 0 7 0
31232 2015-08-24 39.59 1.0 2 18 1 2015.0 1 1.0 1.0 ... 37 15 3 0 15 28 0 8 67 15
31297 2015-08-26 46.35 1.0 7 1 1 2015.0 1 1.0 1.0 ... 238 107 28 1 74 125 1 33 139 12
31335 2015-08-28 35.72 1.0 18 1 1 2015.0 1 1.0 1.0 ... 94 72 40 9 18 36 1 9 107 6
31397 2015-09-02 43.92 1.0 2 1 1 2017.0 1 1.0 1.0 ... 41 25 3 0 14 28 1 4 92 6
31475 2015-09-07 41.83 1.0 18 25 1 1.0 1 1.0 1.0 ... 62 30 5 1 5 23 1 3 67 4
31556 2015-09-10 43.92 1.0 4 1 1 2017.0 1 1.0 1.0 ... 41 25 3 0 14 28 1 4 92 6
31560 2015-09-10 43.92 1.0 9 1 1 2015.0 1 1.0 1.0 ... 41 25 3 0 14 28 1 4 92 6
31569 2015-09-11 42.73 1.0 6 25 1 2015.0 1 1.0 1.0 ... 18 11 2 1 5 4 0 1 32 5
31652 2015-09-16 50.81 1.0 1 17 1 NaN 1 1.0 1.0 ... 22 13 1 0 5 15 0 3 35 3
31690 2015-09-17 39.75 1.0 15 1 1 2015.0 1 1.0 1.0 ... 41 25 3 0 14 28 1 4 92 6
31725 2015-09-19 40.00 1.0 14 1 1 2015.0 1 1.0 1.0 ... 5 1 1 0 2 12 0 1 11 1
31770 2015-09-22 41.67 1.0 4 17 1 2015.0 1 1.0 1.0 ... 1 1 0 0 0 5 0 0 3 1
32036 2015-10-05 43.92 1.0 2 1 1 2015.0 1 1.0 1.0 ... 41 25 3 0 14 28 1 4 92 6
32072 2015-10-06 41.60 1.0 7 1 1 2014.0 1 1.0 1.0 ... 3 0 0 0 1 3 0 2 10 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
36812 2016-03-19 36.50 1.0 4 17 1 NaN 1 1.0 1.0 ... 2 3 1 0 0 7 0 0 3 2
36856 2016-03-21 39.60 1.0 5 9 6 2017.0 1 1.0 1.0 ... 6 3 1 0 3 9 0 1 10 2
36861 2016-03-21 41.71 1.0 11 1 1 1.0 1 1.0 1.0 ... 636 371 141 26 150 249 2 105 203 13
37083 2016-03-28 32.00 1.0 5 17 1 NaN 1 1.0 1.0 ... 1 2 1 0 0 8 0 0 2 1
37187 2016-03-30 45.26 1.0 4 17 1 NaN 1 1.0 1.0 ... 3 2 1 0 2 6 0 0 2 0
37202 2016-03-30 97.20 1.0 15 1 1 2017.0 1 1.0 1.0 ... 140 101 58 12 22 39 1 13 124 7
37245 2016-03-31 40.80 1.0 5 11 1 NaN 1 1.0 1.0 ... 5 1 1 0 2 12 0 0 9 0
37257 2016-04-01 60.64 1.0 4 17 1 NaN 1 1.0 1.0 ... 26 7 3 0 8 18 0 8 71 10
37297 2016-04-04 38.80 1.0 15 1 1 2017.0 1 1.0 1.0 ... 6 3 1 0 3 9 0 1 10 2
37319 2016-04-04 58.31 1.0 16 1 1 2016.0 1 1.0 1.0 ... 8 0 0 0 1 7 1 0 12 1
37457 2016-04-08 33.89 1.0 4 1 1 1.0 1 1.0 1.0 ... 3 0 0 0 1 3 0 2 10 2
37520 2016-04-12 41.08 1.0 3 1 1 1.0 1 1.0 1.0 ... 3 0 0 0 1 3 0 2 10 2
37691 2016-04-19 38.77 1.0 10 1 1 1.0 1 1.0 1.0 ... 636 371 141 26 150 249 2 105 203 13
37747 2016-04-21 36.90 1.0 14 18 4 2017.0 1 1.0 1.0 ... 36 13 3 0 9 30 0 7 68 15
37807 2016-04-25 41.08 1.0 10 1 1 1.0 1 1.0 1.0 ... 636 371 141 26 150 249 2 105 203 13
37891 2016-04-28 34.40 1.0 4 13 1 2017.0 1 1.0 1.0 ... 447 283 118 23 87 159 1 74 163 14
37959 2016-05-05 43.71 1.0 7 1 1 2017.0 1 1.0 1.0 ... 39 24 3 0 16 28 1 3 93 6
37960 2016-05-05 42.94 1.0 22 25 1 NaN 1 1.0 1.0 ... 27 7 3 0 8 18 0 8 72 10
37976 2016-05-06 39.72 1.0 5 1 1 1.0 1 1.0 1.0 ... 3 0 0 0 1 3 0 2 10 2
37991 2016-05-06 42.94 1.0 21 25 1 NaN 1 1.0 1.0 ... 27 7 3 0 8 18 0 8 72 10
38004 2016-05-10 37.75 1.0 21 25 1 NaN 1 1.0 1.0 ... 347 182 70 8 104 170 2 47 150 12
38057 2016-05-14 38.77 1.0 7 1 1 1.0 1 1.0 1.0 ... 636 371 141 26 150 249 2 105 203 13
38060 2016-05-14 43.71 1.0 8 1 1 2016.0 1 1.0 1.0 ... 433 266 102 21 89 167 1 72 168 13
38109 2016-05-20 42.23 1.0 11 17 1 NaN 1 1.0 1.0 ... 27 7 3 0 8 18 0 8 72 10
38122 2016-05-23 33.89 1.0 3 1 1 1.0 1 1.0 NaN ... 526 298 98 16 130 228 2 89 185 14
38127 2016-05-25 19.91 1.0 8 1 1 1.0 1 1.0 NaN ... 526 298 98 16 130 228 2 89 185 14
38128 2016-05-25 23.56 1.0 1 1 1 1.0 1 1.0 1.0 ... 0 0 0 0 0 4 0 0 0 2
38129 2016-05-25 41.68 1.0 13 17 1 NaN 1 1.0 1.0 ... 1 1 0 0 0 5 0 0 3 1
38130 2016-05-26 50.28 1.0 3 17 1 NaN 1 1.0 1.0 ... 17 7 0 0 4 13 0 2 29 2
38133 2016-05-30 41.08 1.0 12 1 1 1.0 1 1.0 1.0 ... 636 371 141 26 150 249 2 105 203 13

154 rows × 290 columns



Data Cleaning


In [374]:
train = pd.read_csv("data/train.csv", index_col="id", parse_dates=["timestamp"])
test = pd.read_csv("data/test.csv", index_col="id", parse_dates=["timestamp"])

In [375]:
train.timestamp.count()


Out[375]:
30471

In [376]:
## filtering
train = train[train.full_sq < 500]
train = train[train.price_doc < 1.111111e+08]
train = train[~((train.full_sq - train.life_sq) < 0)]
train = train[~((train.full_sq - train.kitch_sq) < 0)]
train = train[~((train.life_sq - train.kitch_sq) < 0)]
train = train[~((train.max_floor - train.floor) < 0)]
train = train[~(train.full_sq == 0)]
train = train[~((train.full_sq > 0) & (train.life_sq == 1) & (train.num_room == 1))]

train.loc[train.life_sq > train.full_sq, "life_sq"] = np.NaN
test.loc[test.life_sq > test.full_sq, "life_sq"] = np.NaN

index = [31075, 32370, 33265]
test.loc[index, "life_sq"] = test.loc[index, "full_sq"]

#index = [13120, 28737, 10371, 21418]
#train.loc[index, "build_year"] = train.loc[index, "kitch_sq"]
index = [31381]
test.loc[index, "build_year"] = test.loc[index, "kitch_sq"]

#train.loc[(train.full_sq > 210) & (train.life_sq / train.full_sq < 0.3), "full_sq"] = np.NaN
#test.loc[(test.full_sq > 150) & (test.life_sq / test.full_sq < 0.3), "full_sq"] = np.NaN

train.loc[train.life_sq > 300, ["life_sq", "full_sq"]] = np.NaN
test.loc[test.life_sq > 200, ["life_sq", "full_sq"]] = np.NaN

train.loc[train.build_year == 20052009.0, "build_year"] = 2007

train.loc[train.num_room == 0, "num_room"] = np.NaN
test.loc[test.num_room == 0, "num_room"] = np.NaN

index = [10079, 11624, 17767, 19393, 24010, 26716, 29175]
train.loc[index, "num_room"] = np.NaN
index = [33648, 37787]
test.loc[index, "num_room"] = np.NaN

train.loc[(train.floor == 0) & (train.max_floor == 0), ["max_floor", "floor"]] = np.NaN
test.loc[(test.floor == 0) & (test.max_floor == 0), ["max_floor", "floor"]] = np.NaN

train.loc[train.floor == 0, "floor"] = np.NaN
test.loc[test.floor == 0, "floor"] = np.NaN
train.loc[train.max_floor == 0, "max_floor"] = np.NaN
test.loc[test.max_floor == 0, "max_floor"] = np.NaN

train.loc[train.state == 33, "state"] = np.NaN

train.loc[train.floor > train.max_floor, "max_floor"] = np.NaN
test.loc[test.floor > test.max_floor, "max_floor"] = np.NaN

train.loc[train.full_sq == 0, "full_sq"] = np.NaN
train.loc[train.full_sq == 1, "full_sq"] = np.NaN
train.loc[train.life_sq == 0, "life_sq"] = np.NaN
train.loc[train.life_sq == 1, "life_sq"] = np.NaN
train.loc[train.kitch_sq == 0, "kitch_sq"] = np.NaN
train.loc[train.kitch_sq == 1, "kitch_sq"] = np.NaN
train.loc[train.build_year < 1000, "build_year"] = np.NaN
train.loc[train.build_year > 2030, "build_year"] = np.NaN

test.loc[test.full_sq == 0, "full_sq"] = np.NaN
test.loc[test.full_sq == 1, "full_sq"] = np.NaN
test.loc[test.life_sq == 0, "life_sq"] = np.NaN
test.loc[test.life_sq == 1, "life_sq"] = np.NaN
test.loc[test.kitch_sq == 0, "kitch_sq"] = np.NaN
test.loc[test.kitch_sq == 1, "kitch_sq"] = np.NaN
test.loc[test.kitch_sq == 1968, "kitch_sq"] = np.NaN
test.loc[test.kitch_sq == 1000, "kitch_sq"] = np.NaN
test.loc[test.build_year < 1000, "build_year"] = np.NaN
test.loc[test.build_year > 2030, "build_year"] = np.NaN

## round the test areas to whole numbers to match the train precision
test.full_sq = test.full_sq.round()
test.life_sq = test.life_sq.round()
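
A quick re-run of the earlier consistency checks confirms the effect of the cleaning: the train counts should now all be zero because those rows were filtered out, while test rows were only patched, not dropped. A sketch:

pairs = [("full_sq", "life_sq"), ("full_sq", "kitch_sq"),
         ("life_sq", "kitch_sq"), ("max_floor", "floor")]
for big, small in pairs:
    print("{} < {}: train={}, test={}".format(
        big, small,
        (train[big] < train[small]).sum(),
        (test[big] < test[small]).sum()))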

In [377]:
train.timestamp.count()


Out[377]:
28747

In [378]:
train.full_sq.describe()


Out[378]:
count    28744.00000
mean        53.61171
std         21.52224
min          6.00000
25%         38.00000
50%         49.00000
75%         63.00000
max        461.00000
Name: full_sq, dtype: float64

In [379]:
test.full_sq.describe()


Out[379]:
count    7659.000000
mean       53.689124
std        19.992401
min         2.000000
25%        39.000000
50%        50.000000
75%        63.000000
max       403.000000
Name: full_sq, dtype: float64

In [380]:
train.life_sq.describe()


Out[380]:
count    23152.000000
mean        34.196182
std         17.983174
min          7.000000
25%         20.000000
50%         30.000000
75%         43.000000
max        232.000000
Name: life_sq, dtype: float64

In [381]:
test.life_sq.describe()


Out[381]:
count    6147.000000
mean       33.930861
std        15.893917
min         2.000000
25%        21.000000
50%        31.000000
75%        43.000000
max       197.000000
Name: life_sq, dtype: float64

In [382]:
train.kitch_sq.describe()


Out[382]:
count    14469.000000
mean         8.089225
std          3.832581
min          2.000000
25%          6.000000
50%          8.000000
75%         10.000000
max        123.000000
Name: kitch_sq, dtype: float64

In [383]:
test.kitch_sq.describe()


Out[383]:
count    5532.000000
mean        8.757055
std         3.865231
min         1.100000
25%         6.300000
50%         8.500000
75%        10.000000
max       116.700000
Name: kitch_sq, dtype: float64

In [384]:
train.floor.describe()


Out[384]:
count    28573.000000
mean         7.611136
std          5.266894
min          1.000000
25%          3.000000
50%          6.000000
75%         11.000000
max         44.000000
Name: floor, dtype: float64

In [385]:
test.floor.describe()


Out[385]:
count    7662.000000
mean        7.652571
std         5.099418
min         1.000000
25%         4.000000
50%         7.000000
75%        11.000000
max        41.000000
Name: floor, dtype: float64

In [386]:
train.build_year.value_counts(dropna=False).head()


Out[386]:
NaN        13287
 2014.0      742
 2015.0      713
 1970.0      412
 1969.0      406
Name: build_year, dtype: int64

In [387]:
test.build_year.value_counts(dropna=False).head()


Out[387]:
NaN        1607
 2017.0     413
 2016.0     400
 2013.0     260
 2015.0     259
Name: build_year, dtype: int64

In [388]:
train.build_year.describe()


Out[388]:
count    15460.000000
mean      1984.620052
std         20.722733
min       1691.000000
25%       1968.000000
50%       1980.000000
75%       2005.000000
max       2018.000000
Name: build_year, dtype: float64

In [389]:
test.build_year.describe()


Out[389]:
count    6055.000000
mean     1988.625764
std        21.899159
min      1875.000000
25%      1969.000000
50%      1986.000000
75%      2013.000000
max      2019.000000
Name: build_year, dtype: float64

In [390]:
train.state.value_counts(dropna=False)


Out[390]:
NaN     12777
 2.0     5794
 3.0     5710
 1.0     4051
 4.0      415
Name: state, dtype: int64

In [391]:
train.max_floor.describe()


Out[391]:
count    19188.000000
mean        13.450855
std          6.099241
min          1.000000
25%          9.000000
50%         14.000000
75%         17.000000
max        117.000000
Name: max_floor, dtype: float64

In [392]:
test.max_floor.describe()


Out[392]:
count    7019.000000
mean       13.666192
std         5.866484
min         1.000000
25%         9.000000
50%        14.000000
75%        17.000000
max        48.000000
Name: max_floor, dtype: float64

In [393]:
train[train.num_room == 0].timestamp.count()


Out[393]:
0

In [394]:
train.to_csv("data/train_clean.csv")
test.to_csv("data/test_clean.csv")
