In [1]:
## Necessary Imports
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
color = sns.color_palette()
In [2]:
import os
import re
In [2]:
# Directory that holds the competition CSV files. Built with os.path.join so
# the separator matches the host OS (on Windows this is the same string as
# before); the trailing empty component keeps the trailing separator that
# f'{PATH}training_data.csv' relies on.
PATH = os.path.join("kaggle", "case", "")
In [3]:
# Shell out to list the contents of PATH.
!dir {PATH}
In [4]:
# Load the training set. The f-string concatenates directly, so PATH must end with a path separator.
df_raw = pd.read_csv(f'{PATH}training_data.csv')
In [5]:
df_raw.columns
Out[5]:
In [6]:
# Per-column summary of the raw frame.
df_raw.info()
In [8]:
# include='all' also summarises the non-numeric columns.
df_raw.describe(include='all')
Out[8]:
In [9]:
## Get A Quick Overview of What We Are Dealing With
# Distribution of the target column; the trailing ';' suppresses the repr of the returned object.
sns.distplot(df_raw['resale_price']);
In [10]:
# Put numbers on the skewness and kurtosis of the target distribution.
print("Skewness: %f" % df_raw['resale_price'].skew())
print("Kurtosis: %f" % df_raw['resale_price'].kurt())
In [11]:
df_raw['resale_price'].describe()
Out[11]:
In [12]:
def disply_dtype_plot(df = None):
    """Draw a count plot of how many columns of ``df`` fall in each dtype bucket.

    Columns are bucketed as 'integer dtype' (int64), 'object dtype' (object)
    or 'float dtype' (float64). Columns of any other dtype are left out of
    the plot.

    Parameters
    ----------
    df : DataFrame, optional
        Frame whose column dtypes are tallied. When None the function
        returns immediately without plotting.

    Returns
    -------
    None
    """
    if df is None:
        return
    dtype_labels = (
        ('int64', 'integer dtype'),
        ('object', 'object dtype'),
        ('float64', 'float dtype'),
    )
    labels = []
    for col in df.columns:
        col_dtype = df[col].dtype
        for dtype_name, label in dtype_labels:
            if col_dtype == dtype_name:
                labels.append(label)
                break
    # Pass the labels by keyword: from seaborn 0.12 on, the first positional
    # argument of countplot is `data`, so a bare positional list is no longer
    # read as the x variable.
    sns.countplot(x=labels)
# Plot the dtype breakdown of the raw frame.
disply_dtype_plot(df_raw)
In [9]:
# head(0) returns no rows, so this shows only the column headers.
df_raw.head(0)
Out[9]:
In [8]:
# Drop four columns in place. Re-running this cell raises KeyError because the labels are gone after the first run.
df_raw.drop(['Unnamed: 0','temp','building_id','no_times_resold'], inplace=True, axis=1)
In [16]:
# Age is 'year' minus 'lease_commence_date'.
# NOTE(review): the notebook only assigns a 'year' column further down (from the split 'month' strings).
# Confirm the CSV already ships a numeric 'year'; otherwise this cell fails on a top-to-bottom run.
df_raw["Age"] = df_raw['year'] - df_raw['lease_commence_date']
In [58]:
df_raw.tail(n=10)
Out[58]:
In [15]:
# Copy the 'month' column into a plain Python list; a later cell loops over it to split each value.
month_non_split = list(df_raw['month'])
In [19]:
# Try the split on one literal sample and show both pieces.
s = re.split('-', '2000-01')
print(s[0])
print(s[1])
In [20]:
# Split every entry on '-' and collect the first piece in `year` and the
# second piece in `mon`, keeping both lists in the original row order.
year, mon = [], []
for stamp in month_non_split:
    parts = re.split('-', stamp)
    year.append(parts[0])
    mon.append(parts[1])
In [24]:
# Remove the combined 'month' column; the next cell re-creates 'month' from the split values.
df_raw.drop(['month'],axis =1,inplace=True)
In [25]:
# Both columns now hold the string pieces produced by re.split, not numbers.
df_raw['month'] = mon
df_raw['year'] = year
In [26]:
df_raw.columns
Out[26]:
In [27]:
df_raw.head()
Out[27]:
In [27]:
# Number of rows per 'Age' value.
var = 'Age'
plt.figure(figsize=(10,10))
plt.xticks(rotation = 90)
# Name the x variable explicitly: from seaborn 0.12 on, the first positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=df_raw[var])
Out[27]:
In [58]:
# Number of rows per 'month' value.
var = 'month'
plt.xticks(rotation = 45)
# Name the x variable explicitly: from seaborn 0.12 on, the first positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=df_raw.month)
Out[58]:
In [59]:
# Number of rows per 'year' value.
plt.xticks(rotation = 45)
# Name the x variable explicitly: from seaborn 0.12 on, the first positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=df_raw.year)
Out[59]:
In [33]:
# Number of rows per 'flat_type' value.
var = 'flat_type'
plt.xticks(rotation = 45)
# Name the x variable explicitly: from seaborn 0.12 on, the first positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=df_raw.flat_type)
Out[33]:
In [43]:
from collections import Counter
# Occurrences of each street name across all rows.
Counter(df_raw['street_name'])
Out[43]:
In [47]:
# Scatter of resale price against lease commencement date.
var = 'lease_commence_date'
data = pd.concat([df_raw['resale_price'], df_raw[var]], axis=1)
data.plot.scatter(x=var, y='resale_price')
Out[47]:
In [48]:
# Box plot of resale price for each 'year' value.
var = 'year'
data = pd.concat([df_raw['resale_price'], df_raw[var]], axis=1)
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x=var, y="resale_price", data=data)
# Rotate the x tick labels once the plot exists.
plt.xticks(rotation=90);
In [42]:
# Number of rows per 'street_name' value.
var = 'street_name'
plt.figure(figsize=(15,15))
plt.xticks(rotation = 90)
# Name the x variable explicitly: from seaborn 0.12 on, the first positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=df_raw[var])
Out[42]:
In [38]:
# Number of rows per 'town' value.
var = 'town'
plt.figure(figsize=(8,8))
plt.xticks(rotation = 90)
# Name the x variable explicitly: from seaborn 0.12 on, the first positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=df_raw[var])
Out[38]:
In [39]:
# Number of rows per 'storey_range' value.
var = 'storey_range'
plt.xticks(rotation = 45)
# Name the x variable explicitly: from seaborn 0.12 on, the first positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=df_raw[var])
Out[39]:
In [44]:
# Number of rows per 'flat_model' value.
var = 'flat_model'
plt.figure(figsize=(8,8))
plt.xticks(rotation = 90)
# Name the x variable explicitly: from seaborn 0.12 on, the first positional
# argument of countplot is `data`, so a positional Series is no longer read as x.
sns.countplot(x=df_raw[var])
Out[44]:
Only a few flat models play a significant role here: NewGen, Improved, ModelA, Standard, Apart, Simplified, Premium and Maisonette. The rest can be removed.
In [64]:
train_cats(df_raw)
# Helper function that automates the boring parts of the preparation.
# NOTE(review): presumably it converts string columns to categoricals (the `.cat` access two cells below relies on that) — confirm with `shift+tab` or `??train_cats`.
In [65]:
df_raw.info()
In [66]:
# Integer codes behind the 'month' column; the .cat accessor only exists on categorical columns.
df_raw['month'].cat.codes
Out[66]:
In [68]:
#correlation matrix
# Correlate only the numeric/boolean columns. Older pandas dropped the other
# columns inside DataFrame.corr() on its own, while pandas >= 2.0 raises on
# them instead, so selecting explicitly gives the same matrix on old versions
# and keeps the cell running on new ones.
corrmat = df_raw.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(corrmat, vmax=.8, square=True);
In [10]:
# Replace the target with its natural log. The column is overwritten in place, so running this cell twice takes the log twice.
# A later cell applies np.exp to the same column to undo this.
df_raw.resale_price = np.log(df_raw.resale_price)
In [11]:
def rmse(x,y):
    """Root-mean-square error between ``x`` and ``y``.

    The element-wise difference ``x - y`` is squared, averaged with
    ``.mean()``, and the square root of that average is returned.
    """
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())
def print_score(m):
    """Print and return train/validation metrics for the fitted model ``m``.

    The returned list holds, in order: RMSE on the training set, RMSE on the
    validation set, ``m.score`` on the training set, ``m.score`` on the
    validation set and, only when the model has an ``oob_score_`` attribute,
    that value. Reads the module-level X_train, y_train, X_valid and y_valid.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)
    res = [train_rmse, valid_rmse, train_score, valid_score]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)
    return res
In [14]:
# Second train_cats call on the same frame; an earlier cell already ran it.
train_cats(df_raw)
In [15]:
# NOTE(review): presumably returns the processed feature frame and the 'resale_price' target — confirm with `??proc_df`. The third return value is discarded.
df, y, _ = proc_df(df_raw, 'resale_price')
In [16]:
df.columns
Out[16]:
In [17]:
# Baseline forest, fitted and scored on the very same rows, so this score measures fit to the training data rather than generalisation.
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)
Out[17]:
In [18]:
def split_vals(a,n):
    """Split ``a`` at position ``n`` and return independent copies of both parts.

    Returns the tuple ``(a[:n].copy(), a[n:].copy())``: the first ``n`` items
    and everything after them.
    """
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail
# Hold out the last n_valid rows for validation; everything before them is training data.
n_valid = 40000
n_trn = len(df)-n_valid
# The same cut point is applied to the raw frame, the processed features and the target, so the three splits cover the same row positions.
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)
X_train.shape, y_train.shape, X_valid.shape
Out[18]:
In [19]:
# A single depth-3 tree without bootstrapping, fitted on the training split only.
m = RandomForestRegressor(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
# Score on the held-out rows. Scoring on (df, y) would mostly re-score the
# rows the tree was just fitted on and so overstate how well it generalises.
m.score(X_valid, y_valid)
Out[19]:
In [20]:
# Render the first (and only) estimator of the current model.
draw_tree(m.estimators_[0], X_train, precision=3)
In [21]:
# Feature importances for the current model `m`, showing the first ten rows.
# NOTE(review): at this point `m` is the one-tree, depth-3 model fitted in the previous model cell — confirm that importances from that model are what is wanted here.
fi = rf_feat_importance(m, df); fi[:10]
Out[21]:
In [22]:
# Plot 'imp' against 'cols'.
fi.plot('cols', 'imp', figsize=(10,6), legend=False);
In [23]:
def plot_fi(fi):
    """Plot feature importances as a horizontal bar chart.

    ``fi`` must provide a pandas-style ``plot`` method and the columns
    'cols' and 'imp'. Whatever ``fi.plot`` returns is passed back.
    """
    plot_options = dict(figsize=(12,7), legend=False)
    return fi.plot(x='cols', y='imp', kind='barh', **plot_options)
In [24]:
# Bar chart of the first ten rows of the importance table.
plot_fi(fi[:10]);
In [25]:
from scipy.cluster import hierarchy as hc
In [26]:
# Spearman rank correlation across the columns of df, rounded to 4 decimals.
corr = np.round(scipy.stats.spearmanr(df).correlation, 4)
# Turn correlation into a distance (1 - corr) and convert the square matrix to condensed form.
corr_condensed = hc.distance.squareform(1-corr)
# Hierarchical clustering with average linkage on those distances.
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(10,10))
# Leaves are labelled with the column names of df.
dendrogram = hc.dendrogram(z, labels=df.columns, orientation='left', leaf_font_size=16)
plt.show()
In [28]:
from pdpbox import pdp
from plotnine import *
In [29]:
# NOTE(review): presumably caps the number of rows each tree is trained on — confirm with `??set_rf_samples`.
set_rf_samples(10000)
In [30]:
# Re-process the frame with max_n_cat=20 and rebuild the feature split from it.
# y_trn is not used in the visible cells; the fit reuses y_train from the earlier split cell, so X_train/X_valid are overwritten here while y_train is not.
df_trn2, y_trn, _ = proc_df(df_raw, 'resale_price', max_n_cat=20)
X_train, X_valid = split_vals(df_trn2, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.6, n_jobs=-1)
m.fit(X_train, y_train);
In [31]:
# Importance chart for the first 20 rows returned for the refitted model.
plot_fi(rf_feat_importance(m, df_trn2)[:20]);
In [32]:
# Sample of the training features (second argument 500); plot_pdp reads this global `x`.
x = get_sample(X_train, 500)
In [43]:
def plot_pdp(feat, clusters=None, feat_name=None):
    """Draw a partial-dependence plot for ``feat`` using the global model ``m`` and sample ``x``.

    feat      -- feature passed to ``pdp.pdp_isolate``.
    clusters  -- when not None, clustering is switched on and this value is
                 forwarded as ``n_cluster_centers``.
    feat_name -- label handed to ``pdp.pdp_plot``; falls back to ``feat``
                 when falsy.

    Returns whatever ``pdp.pdp_plot`` returns.
    """
    if not feat_name:
        feat_name = feat
    isolated = pdp.pdp_isolate(m, x, feat)
    use_clusters = clusters is not None
    return pdp.pdp_plot(isolated, feat_name, plot_lines=True,
                        cluster=use_clusters, n_cluster_centers=clusters)
In [48]:
# Apply np.exp to the target column, undoing the earlier np.log. Like that cell, this overwrites in place and is not safe to run twice.
df_raw.resale_price = np.exp(df_raw.resale_price)
In [49]:
# Sample of the raw frame (second argument 500) for plotting.
x_all = get_sample(df_raw, 500)
In [50]:
# Loess-smoothed resale price against 'year' on the sample; se=True is passed to stat_smooth.
ggplot(x_all, aes('year', 'resale_price'))+stat_smooth(se=True, method='loess')
Out[50]:
In [51]:
# NOTE(review): this needs 'year' to exist as a single column in the sample `x`. If proc_df(max_n_cat=20) expanded it into several columns, the lookup would fail — confirm.
plot_pdp('year')