In [1]:
## Necessary Imports
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics
In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
import seaborn as sns
sns.set()
In [4]:
PATH = "kaggle\\house\\"
In [5]:
!dir {PATH}
In [6]:
df_raw = pd.read_csv(f'{PATH}Train (1).csv', low_memory=False)
In [7]:
df_raw.columns
Out[7]:
In [8]:
## Get A Quick Overview of What We Are Dealing With
sns.distplot(df_raw['SalePrice']);
In [9]:
# Skewness and kurtosis of SalePrice are clearly visible here
print("Skewness: %f" % df_raw['SalePrice'].skew())
print("Kurtosis: %f" % df_raw['SalePrice'].kurt())
In [24]:
df_raw['SalePrice'].describe()
Out[24]:
In [10]:
def display_dtype_plot(df=None):
    # Count how many columns of each dtype the dataframe has and plot the counts.
    if df is None:
        return
    l = []
    for col in df.columns:
        if df[col].dtype == 'int64':
            l.append('integer dtype')
        elif df[col].dtype == 'object':
            l.append('object dtype')
        elif df[col].dtype == 'float64':
            l.append('float dtype')
    sns.countplot(l)
    del l
display_dtype_plot(df_raw)
In [52]:
def print_feature(alg, printFeatureImportance=True):
    # Plot the fitted model's feature importances, labelled with df's column names.
    if printFeatureImportance:
        feat_imp = pd.Series(alg.feature_importances_, index=df.columns).sort_values(ascending=False)
        plt.figure(figsize=(20,20))
        feat_imp.plot(kind='bar', title='Feature Importances')
        plt.ylabel('Feature Importance Score')
In any sort of data science work, it's important to look at your data to make sure you understand the format, how it's stored, and what type of values it holds. Even if you've read descriptions of your data, the actual data may not be what you expect.
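For instance, a quick first pass (a minimal sketch using plain pandas on the df_raw loaded above; these are not cells from the original notebook) is to check the shape, the dtype mix, and the amount of missing data:

df_raw.shape                    # number of rows and columns
df_raw.dtypes.value_counts()    # how many columns of each dtype
df_raw.isnull().sum().sum()     # total count of missing values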
In [11]:
def display_all(df):
    with pd.option_context("display.max_rows", 1000):
        with pd.option_context("display.max_columns", 1000):
            display(df)
In [12]:
display_all(df_raw.tail())
There are a lot of columns in this dataset, so a lot of fun ahead.
In [13]:
display_all(df_raw.describe())
In [14]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df_raw.drop('SalePrice', axis=1), df_raw.SalePrice)
Wait, our model just failed badly.
This dataset contains a mix of continuous and categorical variables.
The categorical variables are currently stored as strings, which is inefficient and doesn't provide the numeric coding required for a random forest. Therefore we call train_cats to convert the strings to pandas categories.
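Roughly speaking, train_cats walks over the string (object) columns and turns each into a pandas categorical. A minimal sketch of the idea (assuming fastai 0.7's structured module; this is not its actual implementation):

for col in df_raw.columns:
    if df_raw[col].dtype == 'object':                                  # only the string columns
        df_raw[col] = df_raw[col].astype('category').cat.as_ordered()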
In [17]:
train_cats(df_raw)
# A helper function that automates the boring stuff.
# For further insight, press Shift+Tab or run `??train_cats`
In [18]:
df_raw.info()
In [19]:
df_raw.MSZoning
Out[19]:
In [20]:
df_raw.MSZoning.cat.categories
Out[20]:
In [21]:
df_raw.MSZoning.cat.codes
Out[21]:
In [22]:
total = df_raw.isnull().sum().sort_values(ascending=False)
percent = (df_raw.isnull().sum()/df_raw.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)
Out[22]:
In [23]:
df_raw.SalePrice
Out[23]:
In [25]:
# Take the log of SalePrice: the raw prices are right-skewed, and RMSE on log prices penalizes relative errors rather than absolute ones.
df_raw.SalePrice = np.log(df_raw.SalePrice)
In [26]:
df_raw.SalePrice
Out[26]:
In [27]:
sns.distplot(df_raw.SalePrice)
Out[27]:
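Since np.log has already been applied, np.exp recovers the raw prices. Comparing the skewness before and after the transform (a small check that is not part of the original cells) shows how much the log helps:

print("Skew of raw prices: %f" % np.exp(df_raw.SalePrice).skew())
print("Skew of log prices: %f" % df_raw.SalePrice.skew())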
There is a lot of missing data in some of the columns, so we handle it now to make the analysis easier.
In [28]:
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/house-raw')
In [29]:
df, y, nas, mapper = proc_df(df_raw, 'SalePrice', do_scale=True)
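Roughly, proc_df splits off the target, median-fills missing numeric values (adding a boolean _na indicator column and recording the fill value in nas), converts the categoricals to integer codes, and, with do_scale=True, standardizes the numeric columns via the returned mapper. A hedged sketch of the core idea (not fastai's actual implementation, and omitting the scaling step):

def proc_df_sketch(df, y_fld):
    # Assumes train_cats has already converted string columns to categoricals.
    df = df.copy()
    y = df.pop(y_fld).values
    nas = {}
    for col in df.columns:
        ser = df[col]
        if pd.api.types.is_numeric_dtype(ser):
            if ser.isnull().any():                 # median-fill plus a boolean _na indicator
                df[col + '_na'] = ser.isnull()
                nas[col] = ser.median()
                df[col] = ser.fillna(nas[col])
        else:                                      # categorical -> integer codes (+1 so missing becomes 0)
            df[col] = ser.cat.codes + 1
    return df, y, nas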
In [30]:
df.columns
Out[30]:
In [31]:
nas
Out[31]:
In [32]:
mapper
Out[32]:
We now have something we can pass to a random forest. Hurrah!
In [33]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)
Out[33]:
In [34]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()
n_valid = 50
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)
X_train.shape, y_train.shape, X_valid.shape
Out[34]:
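The split above simply holds out the last 50 rows as a validation set. If the rows carry no meaningful ordering, a random split is a reasonable alternative; a sketch with scikit-learn (the _r suffixed names are just to avoid clobbering the variables above):

from sklearn.model_selection import train_test_split
X_train_r, X_valid_r, y_train_r, y_valid_r = train_test_split(df, y, test_size=0.2, random_state=42)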
Base model
Let's try our model again, this time with separate training and validation sets.
In [35]:
def rmse(x, y): return math.sqrt(((x - y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
           m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)
    return res
In [36]:
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
res = print_score(m)
The 1st value is the RMSE on the training set and the 2nd is the RMSE on the validation set (both on log prices); the 3rd is R^2 on the training set and the 4th is R^2 on the validation set (with the OOB score appended when available).
In [37]:
res
Out[37]:
In [38]:
def display_score(res):
    print('RMSE Train: {:.9f}, RMSE Valid: {:.9f}, R2 Train: {:.9f}, R2 Valid: {:.9f}'.format(res[0], res[1], res[2], res[3]))
display_score(res)
In [39]:
m = RandomForestRegressor(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
res = print_score(m)
display_score(res)
In [40]:
draw_tree(m.estimators_[0], X_train, precision=3)
In [41]:
fi = rf_feat_importance(m, df); fi[:10]
Out[41]:
It seems like only a few of the columns drive the predictions.
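One way to act on that observation (a sketch, not part of the original notebook; the 0.005 threshold and the _k names are arbitrary choices for illustration) is to keep only the columns above a small importance threshold and refit on that subset:

to_keep = fi[fi.imp > 0.005].cols
df_keep = df[to_keep].copy()
X_train_k, X_valid_k = split_vals(df_keep, n_trn)
m_k = RandomForestRegressor(n_estimators=40, n_jobs=-1)
m_k.fit(X_train_k, y_train)
m_k.score(X_valid_k, y_valid)    # compare against the full-feature validation R^2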
In [42]:
fi.plot('cols', 'imp', figsize=(10,6), legend=False);
In [43]:
def plot_fi(fi):
    return fi.plot('cols', 'imp', 'barh', figsize=(12,7), legend=False)
In [44]:
plot_fi(fi[:10]);
In [53]:
# Scatter plot: BsmtFinSF1 vs SalePrice
var = 'BsmtFinSF1'
data = pd.concat([np.exp(df_raw['SalePrice']), df_raw[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));
In [49]:
#scatter plot grlivarea/saleprice
var = 'GrLivArea'
data = pd.concat([np.exp(df_raw['SalePrice']), df_raw[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));
In [50]:
var = 'YearBuilt'
data = pd.concat([np.exp(df_raw['SalePrice']), df_raw[var]], axis=1)
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
plt.xticks(rotation=90);
In [51]:
#box plot overallqual/saleprice
var = 'OverallQual'
data = pd.concat([np.exp(df_raw['SalePrice']), df_raw[var]], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=var, y="SalePrice", data=data)
fig.axis(ymin=0, ymax=800000);
In [52]:
#scatter plot totalbsmtsf/saleprice
var = 'TotalBsmtSF'
data = pd.concat([np.exp(df_raw['SalePrice']), df_raw[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0,800000));
In [54]:
#correlation matrix
corrmat = df_raw.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);
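To zoom in on the variables most strongly correlated with the target (a common follow-up step, sketched here rather than taken from the original cells; k = 10 is an arbitrary choice):

k = 10
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
sns.heatmap(df_raw[cols].corr(), annot=True, fmt='.2f', square=True);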
In [55]:
#scatterplot
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(df_raw[cols], size = 2.5)
plt.show();
In [56]:
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5,
n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
_ = print_score(m)
In [57]:
import scipy.stats
from scipy.cluster import hierarchy as hc
In [58]:
corr = np.round(scipy.stats.spearmanr(df).correlation, 4)
corr_condensed = hc.distance.squareform(1-corr)
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(20,20))
dendrogram = hc.dendrogram(z, labels=df.columns, orientation='left', leaf_font_size=16)
plt.show()
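The dendrogram groups features that carry near-duplicate information. One way to use it (a sketch in the spirit of dropping redundant features, not code from this notebook; the column names below are only examples of pairs that typically cluster together) is to drop one candidate at a time and check whether the out-of-bag score suffers:

def get_oob(df_subset):
    # Fit a small forest and return its out-of-bag R^2 on the training rows.
    m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5,
                              n_jobs=-1, oob_score=True)
    m.fit(df_subset[:n_trn], y_train)
    return m.oob_score_

for col in ('GarageCars', 'GarageArea', 'TotRmsAbvGrd', 'GrLivArea'):
    print(col, get_oob(df.drop(col, axis=1)))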