In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
%matplotlib inline
In [2]:
# Loading training data
training = pd.read_csv("data/kaggle-house-prices/train.csv")
training.info()
In [3]:
training.sample(10)
Out[3]:
In [4]:
SalesPrice = training.SalePrice
In [5]:
# Loading testing data
testing = pd.read_csv("data/kaggle-house-prices/test.csv")
testing.info()
In [6]:
training.shape, testing.shape
Out[6]:
In [7]:
# Combining training and testing data into a single dataframe
df = pd.concat([training.iloc[:, 0:80], testing], ignore_index = True)
df.info()
In [8]:
print("Training: ", training.shape)
print("Testing: ", testing.shape)
print("Combined: ", df.shape)
Loaded both datasets and combined them into a single dataset for data preprocessing. When we are ready for modeling preparation, we will split them back into two datasets.
Amount of NA values
In [9]:
df.isnull().sum().sum() / (df.shape[0] * df.shape[1])
Out[9]:
Close to 6% of the data points across the entire dataset are NA. Find out which columns have nulls in them.
In [10]:
def find_null_columns(frame=None):
    """Report columns that contain nulls, most-missing first.

    Parameters
    ----------
    frame : pd.DataFrame, optional
        Frame to inspect. Defaults to the notebook-global ``df`` so all
        existing zero-argument call sites keep working unchanged.

    Returns
    -------
    pd.Series
        Null count per column (only columns with at least one null),
        sorted descending. The original version was print-only; returning
        the Series makes the result reusable without changing any caller.
    """
    if frame is None:
        frame = df  # fall back to the notebook-global combined dataset
    null_columns = frame.isnull().sum().sort_values(ascending=False)
    null_columns = null_columns[null_columns > 0]
    print("Total number of records: ", frame.shape[0])
    print("No of columns with null values: ", len(null_columns))
    print("\nMissing values per column:")
    print(null_columns)
    return null_columns
find_null_columns()
In [11]:
def find_columns(df, search):
    """Return every column of ``df`` whose name contains ``search``,
    matched case-insensitively, in the frame's column order."""
    needle = search.lower()
    matched = []
    for name in df.columns:
        if needle in name.lower():
            matched.append(name)
    return matched
find_columns(df, "pool")
Out[11]:
From the field types described above, we can see PoolArea is numeric and PoolQC is categorical. Find out whether there are any records where PoolArea is mentioned but PoolQC is null.
In [12]:
df.query("PoolArea >= 0 and PoolQC != PoolQC").shape
Out[12]:
Out of these, how many have >0 PoolArea?
In [13]:
df.query("PoolArea > 0 and PoolQC != PoolQC")[['PoolArea', 'PoolQC']]
Out[13]:
We can impute the values of PoolQC by looking at the PoolArea value. Find out how many different values there are for PoolQC, their frequencies, and the mean PoolArea for each.
In [14]:
# Mean PoolArea and group size per PoolQC category.
# Passing raw callables (np.mean, len) to .agg is deprecated in modern pandas;
# named aggregation keeps the same output column labels ("mean", "len").
# len counted all rows in each group (NaN included), which is "size".
df.groupby("PoolQC").PoolArea.agg(mean="mean", len="size")
Out[14]:
So, we can impute Ex, Ex, Fa respectively so that the PoolArea values are closer to the mean values for the respective PoolQC. Replace the rest of the NA values under PoolQC with "None".
In [15]:
df.loc[2420,'PoolQC'] = "Ex"
df.loc[2503,'PoolQC'] = "Ex"
df.loc[2599,'PoolQC'] = "Fa"
idx = df.query("PoolArea == 0 and PoolQC != PoolQC").index
df.loc[idx, 'PoolQC'] = "None"
df.query("PoolArea >= 0 and PoolQC != PoolQC").shape
Out[15]:
In [16]:
garage_cols = find_columns(df, "garage")
df[garage_cols].info()
Lets see what the deal is with GarageYrBlt. It seems reasonable that most houses would build a garage when the house itself was built. We can check this by seeing how many houses were built the same year their garage was built.
In [17]:
(df.GarageYrBlt == df.YearBuilt).value_counts()
Out[17]:
2216 of the 2919 houses have the same year for GarageYrBlt and YearBuilt. Let's replace any of the NA's for GarageYrBlt with the year from YearBuilt.
In [18]:
df.query("GarageYrBlt != GarageYrBlt").shape
Out[18]:
In [19]:
idx = df.query("GarageYrBlt != GarageYrBlt").index
df.loc[idx, "GarageYrBlt"] = df.loc[idx, "YearBuilt"]
(df.GarageYrBlt == df.YearBuilt).value_counts()
Out[19]:
In [20]:
pd.isnull(df.GarageYrBlt).value_counts()
Out[20]:
So, there is no more NA in the GarageYrBlt field.
Let's look at the top 10 values of GarageYrBlt and the top 10 YearBuilt values to find any anomalies.
In [21]:
print(df.GarageYrBlt.sort_values(ascending = False)[:10])
print(df.YearBuilt.sort_values(ascending = False)[:10])
In [22]:
df[df.GarageYrBlt == 2207][["GarageYrBlt", "YearBuilt"]]
Out[22]:
Replace GarageYrBlt with YearBuild for record 2593
In [23]:
df.loc[2592, "GarageYrBlt"] = 2006
Plot histogram of GarageYrBlt
In [24]:
df.GarageYrBlt.plot.hist(bins = 100)
plt.title("Histogram by GarageYrBlt")
plt.xlabel("GarageYrBlt")
Out[24]:
Plot histogram of YearBuilt
In [25]:
df.YearBuilt.plot.hist(bins = 100)
plt.title("Histogram by YearBuilt")
plt.xlabel("YearBuilt")
Out[25]:
Let's see values at the lower end to find any anomalies.
In [26]:
print(df.GarageYrBlt.sort_values(ascending = True)[:10])
print(df.YearBuilt.sort_values(ascending = True)[:10])
At the lower values there is no obvious anomaly.
That leaves 6 garage features in our dataset and 4 of them have at least 157 missing values while GarageArea and GarageCars both only have 1, thus we can assume this particular house does not have a garage at all. For the rest of the houses we can check to see that if the NA’s recorded also have 0 GarageArea and 0 GarageCars. If they do we can fill in their missing values with ‘None’ since having 0 area and 0 cars in their garage will imply that they do not have any at all.
In [27]:
df[df.GarageCond.isnull()][garage_cols].query("GarageArea > 0")
Out[27]:
Only one house that had NA's in its garage columns had an area greater than 0. We can fill this house in manually and set the rest of the houses' NA's to 0.
For the house with GarageArea = 360 and GarageCars = 1, but NA’s in the other columns, we can use the most frequent values for each columns from houses with a similar area and car count.
In [28]:
idx = df.query("GarageArea >= 350 and GarageArea <= 370").index
df2 = df.loc[idx, garage_cols]
df2
Out[28]:
Find most common values for each column
In [29]:
def mostFrequent(col):
    """Return the most frequently occurring value in a Series."""
    # value_counts() already orders by descending frequency, so the label
    # with the maximum count is the modal value.
    frequency = col.value_counts()
    return frequency.idxmax()
df2.apply(mostFrequent, axis = 0)
Out[29]:
In [30]:
df.loc[2126, "GarageQual"] = "TA"
df.loc[2126, "GarageFinish"] = "Unf"
df.loc[2126, "GarageCond"] = "TA"
Now we can fill in any missing numeric values with 0 and categoric with ‘None’ since these houses recorded having 0 area and 0 cars in their garage.
In [31]:
pd.isnull(df[garage_cols]).sum()
Out[31]:
In [32]:
garage_cols
Out[32]:
In [33]:
def replace_with(df, col, fill_with):
    """Overwrite the null entries of column ``col`` in ``df`` (in place)
    with ``fill_with``; non-null entries are untouched."""
    missing = df[col].isnull()
    df.loc[missing, col] = fill_with
for c in ["GarageType", "GarageFinish", "GarageQual", "GarageCond"]:
replace_with(df, c, "None")
for c in ["GarageCars", "GarageArea"]:
replace_with(df, c, 0.0)
pd.isnull(df[garage_cols]).sum()
Out[33]:
In [34]:
df[find_columns(df, "kitchen")].info()
In [35]:
df.KitchenQual.value_counts()
Out[35]:
In [36]:
replace_with(df, "KitchenQual", "TA")
In [37]:
df[find_columns(df, "electrical")].info()
In [38]:
df.Electrical.value_counts()
Out[38]:
In [39]:
replace_with(df, "Electrical", "SBrkr")
df[find_columns(df, "electrical")].info()
In [40]:
basement_columns = find_columns(df, "bsmt")
df[basement_columns].info()
From the documentation, below are the descriptions of these fields.
There are 11 basement features each with at least 1 missing value. We can take a look at the subset of just these columns from our data.
In [41]:
df[df.BsmtExposure.isnull()][basement_columns]
Out[41]:
In [42]:
df.BsmtExposure.value_counts()
Out[42]:
Almost all of the missing values for each categoric basement feature comes from houses with 0 on each features corresponding to area. We can fill in these values with ‘None’ since these houses certainly don’t have basements. Rows 949, 1488 and 2349 are the only missing values from BsmtExposure, we can fill this with No as that is the most frequent value and these houses most likely don’t have any exposure for their basements. The rest of the basement columns corresponding to area will be filled with 0 since they likely don’t have a basement and the categoric missing values will be filled with NoBsmt.
In [43]:
df.loc[[949, 1488, 2349], 'BsmtExposure'] = 'No'
In [44]:
df[basement_columns].info()
In [45]:
for c in ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"]:
replace_with(df, c, "None")
for c in ["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
"TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath"]:
replace_with(df, c, 0.0)
df[basement_columns].info()
In [46]:
find_null_columns()
In [47]:
exterior_columns = find_columns(df, "exterior")
exterior_columns
Out[47]:
In [48]:
df.query("Exterior1st != Exterior1st or Exterior2nd != Exterior2nd")[exterior_columns]
Out[48]:
In [49]:
df.loc[2151, "Exterior1st"] = "Other"
df.loc[2151, "Exterior2nd"] = "Other"
In [50]:
find_null_columns()
In [51]:
sale_columns = find_columns(df, "sale")
sale_columns
Out[51]:
In [52]:
df[df.SaleType.isnull()][sale_columns]
Out[52]:
In [53]:
df.groupby(["SaleType", "SaleCondition"]).Id.count().unstack()
Out[53]:
In [54]:
df.loc[2489, "SaleType"] = "WD"
find_null_columns()
In [55]:
functional_columns = find_columns(df, "func")
functional_columns
Out[55]:
In [56]:
df.Functional.value_counts()
Out[56]:
In [57]:
replace_with(df, "Functional", "Typ")
In [58]:
find_null_columns()
In [59]:
df.Utilities.value_counts()
Out[59]:
In [60]:
del df["Utilities"]
find_null_columns()
In [ ]:
In [61]:
zoning_columns = ["MSZoning", "MSSubClass"]
df[df.MSZoning.isnull()][zoning_columns]
Out[61]:
Find the most frequent corresponding subclass
In [62]:
df.groupby(["MSSubClass", "MSZoning"]).Id.count().unstack()
Out[62]:
In [63]:
df.loc[1915, "MSZoning"] = "RM"
df.loc[2216, "MSZoning"] = "RL"
df.loc[2250, "MSZoning"] = "RM"
df.loc[2904, "MSZoning"] = "RL"
In [64]:
find_null_columns()
In [65]:
masonary_columns = find_columns(df, "mas")
masonary_columns
Out[65]:
In [66]:
df[df.MasVnrType.isnull() | df.MasVnrArea.isnull()][masonary_columns]
Out[66]:
All but one house has missing values for both columns. For houses with NA’s on both columns we can fill 0 for the area and None for the type since they likely do not have a masonry veneer. For the house with a MasVnrArea of 198 but NA for MasVnrType we can record the median areas for each type and see which type is closest to 198.
In [67]:
# Group size and median MasVnrArea per MasVnrType.
# Passing raw callables (len, np.median) to .agg is deprecated in modern
# pandas; named aggregation preserves the original "len"/"median" labels.
# len counted all rows per group (NaN included), i.e. "size"; pandas maps
# np.median to its NaN-skipping "median".
df.groupby("MasVnrType").MasVnrArea.agg(len="size", median="median")
Out[67]:
In [68]:
df.loc[2610, "MasVnrType"] = "Stone"
replace_with(df, "MasVnrType", "None")
replace_with(df, "MasVnrArea", 0)
find_null_columns()
There are 486 missing values for LotFrontage, which is quite a lot of values to fill and we can’t just replace these with 0. We’re given that “LotFrontage: Linear feet of street connected to property.” The area of each street connected to the house property is most likely going to have a similar area to other houses in its neighborhood. We can group by each neighborhood and take the median of each LotFrontage and fill the missing values of each LotFrontage based on what neighborhood the house comes from.
In [69]:
lot_columns = find_columns(df, "lot")
lot_columns.append("Neighborhood")
lot_columns
Out[69]:
In [70]:
# Median LotFrontage per Neighborhood, used below to impute missing values.
# Passing raw callables (len, np.median) to .agg is deprecated in modern
# pandas; named aggregation preserves the column labels — the "median"
# column name must survive because the imputation cell reads df2["median"].
neighborhood = df.groupby("Neighborhood").LotFrontage.agg(len="size", median="median")
neighborhood
Out[70]:
In [71]:
df2 = df[lot_columns].merge(neighborhood,
left_on="Neighborhood", right_index=True)
df2
Out[71]:
In [72]:
idx = df[df.LotFrontage.isnull()].index
df.loc[idx, "LotFrontage"] = df2.loc[idx, "median"]
df[lot_columns].merge(neighborhood,
left_on="Neighborhood", right_index=True)
Out[72]:
In [73]:
find_null_columns()
In [74]:
df.Fence.value_counts()
Out[74]:
In [75]:
replace_with(df, "Fence", "None")
In [76]:
find_null_columns()
In [77]:
replace_with(df, "MiscFeature", "None")
find_null_columns()
In [78]:
fire_columns = find_columns(df, "fire")
fire_columns
Out[78]:
In [79]:
df.groupby(["FireplaceQu", "Fireplaces"]).Id.count().unstack()
Out[79]:
In [80]:
df[df.FireplaceQu.isnull()][fire_columns]
Out[80]:
All the houses that have missing values did not record having any fireplaces. We can replace the NA’s with ‘None’ since these houses don’t have any fireplaces at all.
In [81]:
replace_with(df, "FireplaceQu", "None")
In [82]:
find_null_columns()
In [83]:
replace_with(df, "Alley", "None")
find_null_columns()
Let's add back the SalePrice column to the dataset. Note that for the testing data there is no SalePrice, so for those rows the field will be null.
In [84]:
df.head()
Out[84]:
In [85]:
# Re-attach the target column; test rows keep NaN.
# NOTE: .loc slicing is label-inclusive, so the original 0:training.shape[0]
# span also addressed index 1460 (the first test row) — only index alignment
# of the Series kept that cell NaN. Use shape[0]-1 to address exactly the
# training rows.
# NOTE(review): the column is named "SalesPrice" while the source training
# column is "SalePrice" — confirm downstream consumers expect this spelling.
df["SalesPrice"] = np.nan
df.loc[0:training.shape[0] - 1, "SalesPrice"] = SalesPrice
df.head()
Out[85]:
In [86]:
df.to_csv("data/kaggle-house-prices/data_combined_cleaned.csv", index = False)
In [87]:
xlims = training.SalePrice.min(), training.SalePrice.max()
plt.subplot(2, 1, 1)
training.SalePrice.plot.hist(bins = 50, xlim = xlims, sharex = True, title = "Histogram of SalePrice")
plt.subplot(2, 1, 2)
training.SalePrice.plot.box(vert = False, xlim = xlims, sharex = True, title = "Boxplot of SalePrice")
plt.tight_layout()
plt.xlabel("SalePrice")
Out[87]:
In [88]:
training.YearBuilt.plot.hist(bins = 50, title = "Frequency by YearBuilt")
Out[88]:
In [89]:
grouped = training.groupby("YearBuilt").SalePrice.median().sort_index()
plt.figure(figsize=(15, 5))
plt.bar(grouped.index, grouped)
plt.xlabel("YearBuilt")
plt.ylabel("Median SalePrice")
plt.title("Median SalePrice by YearBuilt")
Out[89]:
In [90]:
training.plot.scatter("GarageArea", "SalePrice")
plt.title("SalePrice by GarageArea")
Out[90]:
In [91]:
import seaborn as sns
In [92]:
grouped = training.groupby(["SaleCondition","BldgType"]).SalePrice.median().unstack()
blues = sns.color_palette("Blues", n_colors=10)
sns.heatmap(grouped, cmap = blues)
plt.title("Median Saleprice \n by SaleCondition and BldgType")
Out[92]:
In [ ]: