In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline
In [2]:
train = pd.read_csv("data/train.csv", index_col="id", parse_dates=["timestamp"])
test = pd.read_csv("data/test.csv", index_col="id", parse_dates=["timestamp"])
macro = pd.read_csv("data/macro.csv", parse_dates=["timestamp"])
In [5]:
train.head()
Out[5]:
In [6]:
test.head()
Out[6]:
In [7]:
macro.head()
Out[7]:
In [460]:
dtype_df = train.dtypes.reset_index()
dtype_df.columns = ["Count", "Column Type"]
print("Variables data type:")
dtype_df.groupby("Column Type").aggregate('count').reset_index()
Out[460]:
In [16]:
# more missing values in the train dataset for build_year, state and kitch_sq etc..
train.isnull().sum().sort_values(ascending=False)
Out[16]:
In [17]:
test.isnull().sum().sort_values(ascending=False)
Out[17]:
In [23]:
train[["price_doc", "full_sq"]].plot(kind="scatter", x="full_sq", y="price_doc")
pass
In [24]:
# there is a very high price 1.111111e+08 -> probably outlier
train.price_doc.describe()
Out[24]:
In [8]:
# full_sq has an outlier -> max 5326 remove it
train.full_sq.describe()
Out[8]:
In [9]:
test.full_sq.describe()
Out[9]:
In [20]:
# values for full sq are rounded in the train dataset
train.full_sq.head()
Out[20]:
In [21]:
test.full_sq.head()
Out[21]:
In [18]:
# remove the 7478 value
train.life_sq.describe()
Out[18]:
In [19]:
test.life_sq.describe()
Out[19]:
In [38]:
# build year is very messy -> maybe every data which is not between 1691 and 2020 is NA
train.build_year.value_counts(dropna=False)
Out[38]:
In [39]:
test.build_year.value_counts(dropna=False)
Out[39]:
In [43]:
# error in 33 -> convert to 3 and also to ordinal variable
train.state.value_counts(dropna=False)
Out[43]:
In [44]:
test.state.value_counts(dropna=False)
Out[44]:
In [45]:
train.material.value_counts(dropna=False)
Out[45]:
In [46]:
test.material.value_counts(dropna=False)
Out[46]:
In [61]:
# too high value for floor
train.floor.describe()
Out[61]:
In [62]:
test.floor.describe()
Out[62]:
In [65]:
train.max_floor.describe()
Out[65]:
In [66]:
test.max_floor.describe()
Out[66]:
In [68]:
train.product_type.value_counts(dropna=False)
Out[68]:
In [69]:
# NaN in test for product type
test.product_type.value_counts(dropna=False)
Out[69]:
In [715]:
# train prices are round values for investment type
train[["price_doc", "product_type"]][train.product_type == "Investment"].price_doc.value_counts(dropna=False).head(30)
Out[715]:
In [78]:
# for owner types values are random
train[["price_doc", "product_type"]][~(train.product_type == "Investment")].price_doc.value_counts(dropna=False).head(10)
Out[78]:
In [91]:
# missing values for train data
train.num_room.value_counts(dropna=False)
Out[91]:
In [92]:
test.num_room.value_counts(dropna=False)
Out[92]:
In [98]:
# differences float versus integer
train.num_room.tail()
Out[98]:
In [99]:
test.num_room.tail()
Out[99]:
In [47]:
# check some basic errors
train[(train.full_sq - train.life_sq) < 0].timestamp.count()
Out[47]:
In [48]:
test[(test.full_sq - test.life_sq) < 0].timestamp.count()
Out[48]:
In [49]:
train[(train.full_sq - train.kitch_sq) < 0].timestamp.count()
Out[49]:
In [50]:
test[(test.full_sq - test.kitch_sq) < 0].timestamp.count()
Out[50]:
In [51]:
train[(train.life_sq - train.kitch_sq) < 0].timestamp.count()
Out[51]:
In [52]:
test[(test.life_sq - test.kitch_sq) < 0].timestamp.count()
Out[52]:
In [75]:
train[(train.max_floor - train.floor) < 0].timestamp.count()
Out[75]:
In [76]:
test[(test.max_floor - test.floor) < 0].timestamp.count()
Out[76]:
In [227]:
# remove them
train[train.full_sq == 0]
Out[227]:
In [229]:
test[test.full_sq == 0]
Out[229]:
In [243]:
train[(train.full_sq == 0) | (train.full_sq == 1)].timestamp.count()
Out[243]:
In [244]:
test[(test.full_sq == 0) | (test.full_sq == 1)].timestamp.count()
Out[244]:
In [245]:
train[(train.life_sq == 0) | (train.life_sq == 1)].timestamp.count()
Out[245]:
In [246]:
test[(test.life_sq == 0) | (test.life_sq == 1)].timestamp.count()
Out[246]:
In [235]:
train[train.kitch_sq == 0].timestamp.count()
Out[235]:
In [236]:
test[test.kitch_sq == 0].timestamp.count()
Out[236]:
In [237]:
# convert to NA
train[train.kitch_sq == 1].timestamp.count()
Out[237]:
In [238]:
test[test.kitch_sq == 1].timestamp.count()
Out[238]:
In [321]:
# replace 1968 and 1000 with missing values
test.kitch_sq.dropna().sort_values(ascending=False).head()
Out[321]:
In [365]:
# average room size
train[((train.full_sq / train.life_sq) > 0) > train.num_room]
Out[365]:
In [363]:
train[train.num_room == 0].timestamp.count()
Out[363]:
In [364]:
test[test.num_room == 0].timestamp.count()
Out[364]:
In [367]:
train[((train.full_sq - train.life_sq) == 0)].timestamp.count()
Out[367]:
In [368]:
test[((test.full_sq - test.life_sq) == 0)].timestamp.count()
Out[368]:
In [403]:
# remove those ones as num_room doesn't make any sense here
train[(train.full_sq > 0) & (train.life_sq == 1) & (train.num_room == 1)]
Out[403]:
In [376]:
test[(test.full_sq > 0) & (test.life_sq == 1) & (test.num_room == 1)]
Out[376]:
In [ ]:
In [374]:
train = pd.read_csv("data/train.csv", index_col="id", parse_dates=["timestamp"])
test = pd.read_csv("data/test.csv", index_col="id", parse_dates=["timestamp"])
In [375]:
train.timestamp.count()
Out[375]:
In [376]:
## filtering
train = train[train.full_sq < 500]
train = train[train.price_doc < 1.111111e+08]
train = train[~((train.full_sq - train.life_sq) < 0)]
train = train[~((train.full_sq - train.kitch_sq) < 0)]
train = train[~((train.life_sq - train.kitch_sq) < 0)]
train = train[~((train.max_floor - train.floor) < 0)]
train = train[~(train.full_sq == 0)]
train = train[~((train.full_sq > 0) & (train.life_sq == 1) & (train.num_room == 1))]
train.loc[train.life_sq > train.full_sq, "life_sq"] = np.NaN
test.loc[test.life_sq > test.full_sq, "life_sq"] = np.NaN
index = [31075, 32370, 33265]
test.loc[index, "life_sq"] = test.loc[index, "full_sq"]
#index = [13120, 28737, 10371, 21418]
#train.loc[index, "build_year"] = train.loc[index, "kitch_sq"]
index = [31381]
test.loc[index, "build_year"] = test.loc[index, "kitch_sq"]
#train.loc[(train.full_sq > 210) & (train.life_sq / train.full_sq < 0.3), "full_sq"] = np.NaN
#test.loc[(test.full_sq > 150) & (test.life_sq / test.full_sq < 0.3), "full_sq"] = np.NaN
train.loc[train.life_sq > 300, ["life_sq", "full_sq"]] = np.NaN
test.loc[test.life_sq > 200, ["life_sq", "full_sq"]] = np.NaN
train.loc[train.build_year == 20052009.0, "build_year"] = 2007
train.loc[train.num_room == 0, "num_room"] = np.NaN
test.loc[test.num_room == 0, "num_room"] = np.NaN
index = [10079, 11624, 17767, 19393, 24010, 26716, 29175]
train.loc[index, "num_room"] = np.NaN
index = [33648, 37787]
test.loc[index, "num_room"] = np.NaN
train.loc[(train.floor == 0) & (train.max_floor == 0), ["max_floor", "floor"]] = np.NaN
test.loc[(test.floor == 0) & (test.max_floor == 0), ["max_floor", "floor"]] = np.NaN
train.loc[train.floor == 0, "floor"] = np.NaN
test.loc[test.floor == 0, "floor"] = np.NaN
train.loc[train.max_floor == 0, "max_floor"] = np.NaN
test.loc[test.max_floor == 0, "max_floor"] = np.NaN
train.loc[train.state == 33, "state"] = np.NaN
train.loc[train.floor > train.max_floor, "max_floor"] = np.NaN
test.loc[test.floor > test.max_floor, "max_floor"] = np.NaN
train.loc[train.full_sq == 0, "full_sq"] = np.NaN
train.loc[train.full_sq == 1, "full_sq"] = np.NaN
train.loc[train.life_sq == 0, "life_sq"] = np.NaN
train.loc[train.life_sq == 1, "life_sq"] = np.NaN
train.loc[train.kitch_sq == 0, "kitch_sq"] = np.NaN
train.loc[train.kitch_sq == 1, "kitch_sq"] = np.NaN
train.loc[train.build_year < 1000, "build_year"] = np.NaN
train.loc[train.build_year > 2030, "build_year"] = np.NaN
test.loc[test.full_sq == 0, "full_sq"] = np.NaN
test.loc[test.full_sq == 1, "full_sq"] = np.NaN
test.loc[test.life_sq == 0, "life_sq"] = np.NaN
test.loc[test.life_sq == 1, "life_sq"] = np.NaN
test.loc[test.kitch_sq == 0, "kitch_sq"] = np.NaN
test.loc[test.kitch_sq == 1, "kitch_sq"] = np.NaN
test.loc[test.kitch_sq == 1968, "kitch_sq"] = np.NaN
test.loc[test.kitch_sq == 1000, "kitch_sq"] = np.NaN
test.loc[test.build_year < 1000, "build_year"] = np.NaN
test.loc[test.build_year > 2030, "build_year"] = np.NaN
## round up test
test.full_sq = test.full_sq.round()
test.life_sq = test.life_sq.round()
In [377]:
train.timestamp.count()
Out[377]:
In [378]:
train.full_sq.describe()
Out[378]:
In [379]:
test.full_sq.describe()
Out[379]:
In [380]:
train.life_sq.describe()
Out[380]:
In [381]:
test.life_sq.describe()
Out[381]:
In [382]:
train.kitch_sq.describe()
Out[382]:
In [383]:
test.kitch_sq.describe()
Out[383]:
In [384]:
train.floor.describe()
Out[384]:
In [385]:
test.floor.describe()
Out[385]:
In [386]:
train.build_year.value_counts(dropna=False).head()
Out[386]:
In [387]:
test.build_year.value_counts(dropna=False).head()
Out[387]:
In [388]:
train.build_year.describe()
Out[388]:
In [389]:
test.build_year.describe()
Out[389]:
In [390]:
train.state.value_counts(dropna=False)
Out[390]:
In [391]:
train.max_floor.describe()
Out[391]:
In [392]:
test.max_floor.describe()
Out[392]:
In [393]:
train[train.num_room == 0].timestamp.count()
Out[393]:
In [394]:
train.to_csv("data/train_clean.csv")
test.to_csv("data/test_clean.csv")
In [ ]:
In [ ]: