In [1]:
# Exploratory data analysis of the Kaggle House Prices data set using pandas, seaborn and matplotlib.
# Author: Alaa Awad
# Project: https://www.kaggle.com/c/house-prices-advanced-regression-techniques
In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from scipy.stats import skew
from scipy.stats.stats import pearsonr
%config InlineBackend.figure_format = 'png' #set 'png' here when working on notebook
%matplotlib inline
In [3]:
# Read the training and test tables from the ../input directory.
train_path = '../input/train.csv.gz'
test_path = '../input/test.csv.gz'
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
In [4]:
# Preview the first five rows of the training table.
train.head(n=5)
Out[4]:
In [7]:
# Stack the columns MSSubClass..SaleCondition of train and test into one frame,
# and keep the training target (SalePrice) separately.
feature_columns = slice('MSSubClass', 'SaleCondition')
train_features = train.loc[:, feature_columns]
test_features = test.loc[:, feature_columns]
all_data = pd.concat((train_features, test_features))
labels = train['SalePrice']
In [8]:
# Preview the first five rows of the combined train/test feature table.
all_data.head(n=5)
Out[8]:
In [8]:
# Summary statistics of the combined train/test feature table.
all_data.describe()
Out[8]:
In [10]:
# List the column labels of the combined feature table.
all_data.columns
Out[10]:
In [11]:
# Print descriptive statistics and the median of the training sale prices.
sale_price = train['SalePrice']
print("Some housing prices statistics\n")
print(sale_price.describe())
print("\nThe median of the Housing Price is: ", sale_price.median(axis = 0))
We're not going to do anything fancy here:
In [15]:
# Set the default figure size, then draw histograms of the raw sale price
# next to its log1p transform.
matplotlib.rcParams['figure.figsize'] = (12.0, 6.0)
raw_price = train["SalePrice"]
prices = pd.DataFrame({
    "price": raw_price,
    "log(price + 1)": np.log1p(raw_price),
})
prices.hist()
Out[15]:
In [24]:
# Histogram of the raw sale prices (30 bins, KDE overlay disabled).
# NOTE(review): distplot is deprecated in newer seaborn releases; consider histplot - confirm installed version.
sns.distplot(
    train["SalePrice"],
    bins=30,
    kde=False,
    color='b',
    hist_kws={'alpha': 0.9},
)
Out[24]:
In [22]:
# Histogram of log(1 + sale price) (30 bins, KDE overlay disabled).
# NOTE(review): distplot is deprecated in newer seaborn releases; consider histplot - confirm installed version.
log_price = np.log1p(train["SalePrice"])
sns.distplot(log_price, bins=30, kde=False, color='b', hist_kws={'alpha': 0.9})
Out[22]:
In [25]:
# Pairwise correlations between the numeric training columns, drawn as a heatmap.
numeric_train = train.select_dtypes(include=['float64', 'int64'])
# The first numeric column is skipped - presumably the Id column; verify against the data.
corr = numeric_train.iloc[:, 1:].corr()
plt.figure(figsize=(12, 12))
sns.heatmap(corr, vmax=1, square=True)
Out[25]:
In [28]:
# Rank every numeric feature by the absolute value of its correlation with SalePrice.
cor_dict = corr['SalePrice'].to_dict()
del cor_dict['SalePrice']  # the target's correlation with itself is not informative
print("List the numerical features decendingly by their correlation with Sale Price:\n")
ranked = sorted(cor_dict.items(), key = lambda pair: -abs(pair[1]))
for feature, coefficient in ranked:
    print("{0}: \t{1}".format(feature, coefficient))
The housing price correlates strongly with `OverallQual, GrLivArea, GarageCars, GarageArea, TotalBsmtSF, 1stFlrSF, FullBath, TotRmsAbvGrd, YearBuilt, YearRemodAdd, GarageYrBlt, MasVnrArea` and `Fireplaces`. However, some of those features are highly correlated with one another.
In [30]:
# Scatter plot with a fitted regression line: overall quality vs. sale price.
sns.regplot(data=train, x='OverallQual', y='SalePrice', color='Orange')
Out[30]:
In [36]:
# Scatter nine numeric features against the sale price on a 3x3 grid.
# `price` is kept as a notebook-level name because the next cell reuses it.
price = train.SalePrice.values
scatter_columns = ['GrLivArea', 'GarageArea', 'GarageCars',
                   'TotalBsmtSF', '1stFlrSF', 'OverallQual',
                   'TotRmsAbvGrd', 'MasVnrArea', 'YearBuilt']
# plt.subplots opens its own figure, so no separate plt.figure() call is made:
# the extra call only left an empty figure behind.
f, axarr = plt.subplots(3, 3, figsize=(10, 9))
# axarr.flat walks the grid row by row, matching the order of scatter_columns.
for panel, column in zip(axarr.flat, scatter_columns):
    panel.scatter(train[column].values, price)
    panel.set_title(column)  # title is the exact column name
# Shared y-axis label placed just left of the grid.
f.text(-0.01, 0.5, 'Sale Price', va='center', rotation='vertical', fontsize = 12)
plt.tight_layout()
plt.show()
In [38]:
# Sale price against remodel year (top panel) and construction year (bottom panel).
# Relies on `price`, which is defined in an earlier cell.
fig = plt.figure(2, figsize=(9, 7))
for position, column in ((211, 'YearRemodAdd'), (212, 'YearBuilt')):
    plt.subplot(position)
    plt.scatter(train[column].values, price)
    plt.title(column)
# Shared y-axis label placed just left of the panels.
fig.text(-0.01, 0.5, 'Sale Price', va = 'center', rotation = 'vertical', fontsize = 12)
plt.tight_layout()
In [40]:
# Print the names of the object-dtype columns in the training table.
object_columns = train.select_dtypes(include=['object']).columns
print(object_columns.values)
In [41]:
# Sale price distribution per neighbourhood, with the tick labels rotated by 45 degrees.
plt.figure(figsize=(12, 6))
sns.boxplot(data=train, x='Neighborhood', y='SalePrice')
xt = plt.xticks(rotation=45)
In [42]:
# Number of training rows per neighbourhood, with the tick labels rotated by 45 degrees.
plt.figure(figsize=(12, 6))
sns.countplot(data=train, x='Neighborhood')
xt = plt.xticks(rotation=45)
Neighborhoods with similar housing prices could be grouped into the same bucket to reduce dimensionality.
Housing Price vs Sales
In [44]:
# Side-by-side box plots of sale price by sale type and by sale condition.
fig, ax = plt.subplots(1, 2, figsize=(10, 6))
for axis, column in zip(ax, ('SaleType', 'SaleCondition')):
    sns.boxplot(x=column, y='SalePrice', data=train, ax=axis)
plt.tight_layout()
In [47]:
# One panel per sale year: box plots of sale price for each month sold (1-12),
# with the y-axis limited to 0-500000.
g = sns.FacetGrid(train, col='YrSold', col_wrap=3)
g.map(sns.boxplot, 'MoSold', 'SalePrice', palette='Set2', order=range(1, 13))
g.set(ylim=(0, 500000))
plt.tight_layout()
The timing of the sale does not seem to have a large effect on the house price.
In [49]:
# Stacked box plots of sale price by building type (top) and house style (bottom).
fig, ax = plt.subplots(2, 1, figsize=(10, 8))
top_ax, bottom_ax = ax
sns.boxplot(data=train, x='BldgType', y='SalePrice', ax=top_ax)
sns.boxplot(data=train, x='HouseStyle', y='SalePrice', ax=bottom_ax)
Out[49]:
In [51]:
# Stacked box plots of sale price by Condition1 (top) and Exterior1st (bottom).
fig, ax = plt.subplots(2, 1, figsize=(10, 8))
for axis, column in zip(ax, ('Condition1', 'Exterior1st')):
    sns.boxplot(x=column, y='SalePrice', data=train, ax=axis)
# plt.xticks acts on the current axes only.
x = plt.xticks(rotation=45)
plt.show()
In [53]:
fig, ax = plt.subplots(2, 2, figsize = (10, 8))
sns.boxplot('BsmtCond', 'SalePrice', data = train, ax = ax[0, 0])
sns.boxplot('BsmtQual', 'SalePrice', data = train, ax = ax[0, 1])
sns.boxplot('BsmtExposure', 'SalePrice', data = train, ax = ax[1, 0])
sns.boxplot('BsmtFinType1', 'SalePrice', data = train, ax = ax[1, 1])
Out[53]:
In [56]:
# Violin plot of sale price per Functional category.
# x and y are passed by keyword; recent seaborn releases no longer accept them positionally.
sns.violinplot(x = 'Functional', y = 'SalePrice', data = train)
Out[56]:
In [63]:
# Median sale price per fireplace-quality grade, ordered from 'Ex' to 'Po'.
# catplot(kind='point') is the current name of the former factorplot, and
# `height` is the current name of its former `size` argument.
sns.catplot(x = 'FireplaceQu', y = 'SalePrice', data = train, kind = 'point', color = 'm',
            estimator = np.median, order = ['Ex', 'Gd', 'TA', 'Fa', 'Po'],
            height = 4.5, aspect = 1.35)
Out[63]:
In [65]:
# Cross-tabulate the Fireplaces column against the FireplaceQu column.
pd.crosstab(index=train['Fireplaces'], columns=train['FireplaceQu'])
Out[65]:
In [67]:
# One panel per fireplace-quality grade: box plots of sale price for 1, 2 or 3 fireplaces.
fireplace_grades = ['Ex', 'Gd', 'TA', 'Fa', 'Po']
g = sns.FacetGrid(train, col='FireplaceQu', col_wrap=3, col_order=fireplace_grades)
g.map(sns.boxplot, 'Fireplaces', 'SalePrice', order=[1, 2, 3], palette='Set2')
Out[67]:
Ames is a cold place in winter, so heating quality (as well as fireplace quality) is quite important.
In [69]:
# Cross-tabulate the HeatingQC column against the CentralAir column.
pd.crosstab(index=train['HeatingQC'], columns=train['CentralAir'])
Out[69]:
In [70]:
# Cross-tabulate the HeatingQC column against the FireplaceQu column.
pd.crosstab(index=train['HeatingQC'], columns=train['FireplaceQu'])
Out[70]:
In [73]:
# Mean sale price per heating-quality grade, with one line per CentralAir value.
# catplot(kind='point') is the current name of the former factorplot, and
# `height` is the current name of its former `size` argument.
sns.catplot(x = 'HeatingQC', y = 'SalePrice', hue = 'CentralAir', kind = 'point',
            estimator = np.mean, data = train,
            height = 4.5, aspect = 1.4)
Out[73]:
In [75]:
fig, ax = plt.subplots(1, 2, figsize = (10, 4))
sns.boxplot('Electrical', 'SalePrice', data = train, ax = ax[0]).set(ylim = (0, 400000))
sns.countplot('Electrical', data = train)
plt.tight_layout()
In [77]:
# Mean sale price per kitchen-quality grade, ordered from 'Ex' to 'Fa'.
# catplot(kind='point') is the current name of the former factorplot, and
# `height` is the current name of its former `size` argument.
sns.catplot(x = 'KitchenQual', y = 'SalePrice', kind = 'point', estimator = np.mean,
            height = 4.5, aspect = 1.4, data = train, order = ['Ex', 'Gd', 'TA', 'Fa'])
Out[77]:
In [79]:
# Box plot of sale price per MSZoning category.
sns.boxplot(data=train, x='MSZoning', y='SalePrice')
Out[79]:
In [82]:
# Side-by-side box plots of sale price by Street and by Alley, both ordered Grvl then Pave.
fig, ax = plt.subplots(1, 2, figsize=(10, 4))
for axis, column in zip(ax, ('Street', 'Alley')):
    sns.boxplot(x=column, y='SalePrice', data=train, ax=axis, order=['Grvl', 'Pave'])
plt.tight_layout()
In [85]:
# Report how many Alley values are missing and how Street relates to Alley.
alley_missing = train['Alley'].isnull()
print("The NA's in Alley is: ", alley_missing.sum())
# The message describes rows where Alley is NA, so select the missing rows
# (the earlier notnull() filter selected the opposite set).
print("\nThere are so many NA's in Alley. When Alley is NA, Street = ",
      train[alley_missing].Street.unique())
print("\n", pd.crosstab(train.Street, train.Alley))
In [ ]: