In [1]:
## Necessary Imports
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
color = sns.color_palette()
In [2]:
import re
In [2]:
PATH = "kaggle\\case\\"
In [3]:
!dir {PATH}
In [4]:
df_raw = pd.read_csv(f'{PATH}training_data.csv')
In [5]:
df_raw.columns
Out[5]:
In [6]:
df_raw.info()
In [8]:
df_raw.describe(include='all')
Out[8]:
In [9]:
## Get A Quick Overview of What We Are Dealing With
sns.distplot(df_raw['resale_price']);
In [10]:
# Report the skewness and kurtosis of resale_price as numbers, to go with the distribution plot drawn above.
print("Skewness: %f" % df_raw['resale_price'].skew())
print("Kurtosis: %f" % df_raw['resale_price'].kurt())
In [11]:
df_raw['resale_price'].describe()
Out[11]:
In [12]:
def disply_dtype_plot(df = None):
    """Draw a count plot of how many columns of ``df`` have each basic dtype.

    Only int64, object and float64 columns are tallied; columns of any other
    dtype are left out of the chart.

    Parameters
    ----------
    df : DataFrame or None
        Frame whose column dtypes are counted. When None the function does
        nothing.

    Returns
    -------
    None
        The chart is drawn through seaborn as a side effect.
    """
    if df is None:
        return
    dtype_labels = {
        'int64': 'integer dtype',
        'object': 'object dtype',
        'float64': 'float dtype',
    }
    labels = []
    # df.dtypes yields one dtype per column, so duplicate column names are safe.
    for dtype in df.dtypes:
        label = dtype_labels.get(str(dtype))
        if label is not None:
            labels.append(label)
    # Pass the labels as x= explicitly: newer seaborn releases treat the first
    # positional argument as `data`, not as the x vector.
    sns.countplot(x=labels)
disply_dtype_plot(df_raw)
In [9]:
df_raw.head(0)
Out[9]:
In [8]:
df_raw.drop(['Unnamed: 0','temp','building_id','no_times_resold'], inplace=True, axis=1)
In [16]:
df_raw["Age"] = df_raw['year'] - df_raw['lease_commence_date']
In [58]:
df_raw.tail(n=10)
Out[58]:
In [15]:
month_non_split = list(df_raw['month'])
In [19]:
s = re.split('-','2000-01');print(s[0]);print(s[1])
In [20]:
# Break every "<year>-<month>" string into its two parts, keeping both as strings.
year = [re.split('-', stamp)[0] for stamp in month_non_split]
mon = [re.split('-', stamp)[1] for stamp in month_non_split]
In [24]:
df_raw.drop(['month'],axis =1,inplace=True)
In [25]:
df_raw['month'] = mon
df_raw['year'] = year
In [26]:
df_raw.columns
Out[26]:
In [27]:
df_raw.head()
Out[27]:
In [27]:
var = 'Age'
plt.figure(figsize=(10,10))
plt.xticks(rotation = 90)
sns.countplot(df_raw[var])
Out[27]:
In [58]:
var = 'month'
plt.xticks(rotation = 45)
sns.countplot(df_raw.month)
Out[58]:
In [59]:
plt.xticks(rotation = 45)
sns.countplot(df_raw.year)
Out[59]:
In [33]:
var = 'flat_type'
plt.xticks(rotation = 45)
sns.countplot(df_raw.flat_type)
Out[33]:
In [43]:
from collections import Counter
Counter(df_raw['street_name'])
Out[43]:
In [47]:
var = 'lease_commence_date'
data = pd.concat([df_raw['resale_price'], df_raw[var]], axis=1)
data.plot.scatter(x=var, y='resale_price')
Out[47]:
In [48]:
var = 'year'
data = pd.concat([df_raw['resale_price'], df_raw[var]], axis=1)
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x=var, y="resale_price", data=data)
plt.xticks(rotation=90);
In [42]:
var = 'street_name'
plt.figure(figsize=(15,15))
plt.xticks(rotation = 90)
sns.countplot(df_raw[var])
Out[42]:
In [38]:
var = 'town'
plt.figure(figsize=(8,8))
plt.xticks(rotation = 90)
sns.countplot(df_raw[var])
Out[38]:
In [39]:
var = 'storey_range'
plt.xticks(rotation = 45)
sns.countplot(df_raw[var])
Out[39]:
In [44]:
var = 'flat_model'
plt.figure(figsize=(8,8))
plt.xticks(rotation = 90)
sns.countplot(df_raw[var])
Out[44]:
Here only a few flat models play a significant role, namely NewGen, Improved, ModelA, Standard, Apart, Simplified, Premium and Maisonette; the rest are rare enough that we can remove them.
In [64]:
train_cats(df_raw)
# A helper function that automates the boring stuff. Judging by the `.cat` accessor used on
# df_raw['month'] in a later cell, it appears to convert string columns to pandas categoricals -- TODO confirm.
# For further insight, press `shift+tab` on the call or run `??train_cats`.
In [65]:
df_raw.info()
In [66]:
df_raw['month'].cat.codes
Out[66]:
In [68]:
#correlation matrix
corrmat = df_raw.corr()
sns.heatmap(corrmat, vmax=.8, square=True);
In [10]:
df_raw.resale_price = np.log(df_raw.resale_price)
In [11]:
def rmse(x, y):
    """Return the root of the mean squared difference between ``x`` and ``y``."""
    residual = x - y
    mean_squared = (residual ** 2).mean()
    return math.sqrt(mean_squared)
def print_score(m):
    """Print and return the fit metrics of model ``m`` on the train/validation split.

    The list holds, in order: RMSE on the training set, RMSE on the validation
    set, ``m.score`` on the training set, ``m.score`` on the validation set and,
    when the model exposes it, ``m.oob_score_``. Reads the notebook-level
    ``X_train``, ``y_train``, ``X_valid`` and ``y_valid``.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)
    res = [train_rmse, valid_rmse, train_score, valid_score]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)
    return res
In [14]:
train_cats(df_raw)
In [15]:
df, y, _ = proc_df(df_raw, 'resale_price')
In [16]:
df.columns
Out[16]:
In [17]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)
Out[17]:
In [18]:
def split_vals(a, n):
    """Split ``a`` at position ``n`` and return independent copies of both parts."""
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail
n_valid = 40000
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)
X_train.shape, y_train.shape, X_valid.shape
Out[18]:
In [19]:
m = RandomForestRegressor(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
m.score(df,y)
Out[19]:
In [20]:
draw_tree(m.estimators_[0], X_train, precision=3)
In [21]:
fi = rf_feat_importance(m, df); fi[:10]
Out[21]:
In [22]:
fi.plot('cols', 'imp', figsize=(10,6), legend=False);
In [23]:
def plot_fi(fi):
    """Plot the 'imp' column of ``fi`` against 'cols' as horizontal bars.

    Returns whatever ``fi.plot`` returns.
    """
    options = dict(figsize=(12,7), legend=False)
    return fi.plot('cols', 'imp', 'barh', **options)
In [24]:
plot_fi(fi[:10]);
In [25]:
from scipy.cluster import hierarchy as hc
In [26]:
corr = np.round(scipy.stats.spearmanr(df).correlation, 4)
corr_condensed = hc.distance.squareform(1-corr)
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(10,10))
dendrogram = hc.dendrogram(z, labels=df.columns, orientation='left', leaf_font_size=16)
plt.show()
In [28]:
from pdpbox import pdp
from plotnine import *
In [29]:
set_rf_samples(10000)
In [30]:
# Rebuild the feature matrix with max_n_cat=20 (the 'flat_type_3 ROOM'-style columns used
# later suggest this one-hot encodes low-cardinality categoricals -- TODO confirm).
df_trn2, y_trn, _ = proc_df(df_raw, 'resale_price', max_n_cat=20)
X_train, X_valid = split_vals(df_trn2, n_trn)
# Split the target returned by *this* proc_df call, so features and target come from the
# same pass over df_raw rather than relying on y_train/y_valid left over from an earlier cell.
y_train, y_valid = split_vals(y_trn, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.6, n_jobs=-1)
m.fit(X_train, y_train);
In [31]:
plot_fi(rf_feat_importance(m, df_trn2)[:20]);
In [32]:
x = get_sample(X_train, 500)
In [43]:
def plot_pdp(feat, clusters=None, feat_name=None):
    """Plot the partial dependence of model ``m`` on ``feat`` over the sample ``x``.

    ``feat_name`` is the label handed to the plot and falls back to ``feat``
    when empty. Passing ``clusters`` switches clustering on and is forwarded as
    the number of cluster centres. Reads the notebook-level ``m`` and ``x``.
    """
    if not feat_name:
        feat_name = feat
    isolated = pdp.pdp_isolate(m, x, feat)
    use_clusters = clusters is not None
    return pdp.pdp_plot(isolated, feat_name, plot_lines=True,
                        cluster=use_clusters, n_cluster_centers=clusters)
In [48]:
df_raw.resale_price = np.exp(df_raw.resale_price)
In [49]:
x_all = get_sample(df_raw, 500)
In [50]:
ggplot(x_all, aes('year', 'resale_price'))+stat_smooth(se=True, method='loess')
Out[50]:
In [51]:
plot_pdp('year')
In [53]:
plot_pdp('year', clusters=5)
In [52]:
plot_pdp('floor_area_sqm', clusters=5)
In [120]:
feats = ['year','lease_commence_date']
p = pdp.pdp_interact(m, x, feats)
pdp.pdp_interact_plot(p, feats)
In [121]:
from treeinterpreter import treeinterpreter as ti
In [124]:
fi = rf_feat_importance(m, x); fi[:15]
Out[124]:
In [125]:
feats=['flat_type_3 ROOM','flat_type_4 ROOM']
In [126]:
(X_train[feats]/1000).describe()
Out[126]:
In [127]:
(X_valid[feats]/1000).describe()
Out[127]:
In [128]:
x.drop(feats, axis=1, inplace=True)
In [37]:
from collections import Counter
z = Counter(df_raw['street_name'])
In [42]:
for k, v in z.items():
print("%s: %s" % (k, v))
In [43]:
from collections import OrderedDict
z_sorted_by_value = OrderedDict(sorted(z.items(), key=lambda x: x[1], reverse= True))
In [77]:
# One row per street with its frequency in a 'freq' column: build a single-row frame, then transpose it.
df_count_street = pd.DataFrame(z_sorted_by_value, index = ['freq']).T
df_count_street
Out[77]:
In [28]:
df_raw
Out[28]:
In [18]:
new_df = pd.read_csv(f'{PATH}train.csv')
In [19]:
new_df.head(1)
Out[19]:
In [20]:
new_df.info()
In [20]:
new_df.head(1)
Out[20]:
In [21]:
plt.figure(figsize=(8,6))
plt.scatter(range(new_df.shape[0]), np.sort(new_df.resale_price.values))
plt.xlabel('index', fontsize=12)
plt.ylabel('price', fontsize=12)
##if needed, one can truncate the high values.
Out[21]:
In [22]:
plt.figure(figsize=(12,8))
sns.distplot(new_df.resale_price.values, bins=50, kde=True)
plt.xlabel('price', fontsize=12)
plt.show()
#Certainly a long right tail.let us plot the log of resale_price variable.
In [23]:
plt.figure(figsize=(12,8))
sns.distplot(np.log(new_df.resale_price.values), bins=50, kde=True)
plt.xlabel('price', fontsize=12)
Out[23]:
Now let us see how the median housing price changes with time.
In [24]:
# Median resale_price for each value of `year`, drawn as a bar chart.
grouped_df = new_df.groupby('year')['resale_price'].median().reset_index()
plt.figure(figsize=(8,8))
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.barplot(x=grouped_df.year.values, y=grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Year', fontsize=12)
plt.xticks(rotation=45)
Out[24]:
In [25]:
# Median resale_price for each value of `month`, drawn as a bar chart.
grouped_df = new_df.groupby('month')['resale_price'].median().reset_index()
plt.figure(figsize=(12,8))
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.barplot(x=grouped_df.month.values, y=grouped_df.resale_price.values, alpha=0.8, color=color[5])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Month', fontsize=12)
plt.xticks(rotation='vertical')
Out[25]:
In [26]:
# Median resale_price for each value of `no_times_resold`, drawn as a bar chart.
grouped_df = new_df.groupby('no_times_resold')['resale_price'].median().reset_index()
plt.figure(figsize=(15,15))
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.barplot(x=grouped_df.no_times_resold.values, y=grouped_df.resale_price.values, alpha=0.8, color=color[4])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('no_times_resold', fontsize=12)
plt.xticks(rotation='vertical')
Out[26]:
In [27]:
plt.figure(figsize=(25,25))
plt.xlabel('floor_area_sqm', fontsize=16)
plt.ylabel('resale_price', fontsize=16)
new_df.groupby('floor_area_sqm')['resale_price'].mean().plot(kind='area')
Out[27]:
In [28]:
plt.figure(figsize=(10,10))
plt.xticks(rotation=45)
new_df.groupby('lease_commence_date')['resale_price'].mean().plot(kind='bar')
Out[28]:
In [29]:
plt.figure(figsize=(12,12))
plt.xticks(rotation = 'vertical')
sns.boxplot(x="no_times_resold", y="age", data=new_df)
Out[29]:
In [30]:
plt.figure(figsize=(12,12))
plt.xticks(rotation = 'vertical')
sns.boxplot(x="no_times_resold", y="resale_price", data=new_df)
Out[30]:
In [39]:
figbi, axesbi = plt.subplots(2, 4, figsize=(25, 25))
new_df.groupby('floor_area_sqm')['resale_price'].mean().plot(kind='barh',ax=axesbi[0,0])
new_df.groupby('lease_commence_date')['resale_price'].mean().plot(kind='barh',ax=axesbi[0,1])
new_df.groupby('no_times_resold')['resale_price'].mean().plot(kind='barh',ax=axesbi[0,2])
new_df.groupby('month')['resale_price'].mean().plot(kind='barh',ax=axesbi[0,3])
new_df.groupby('flat_type')['resale_price'].mean().plot(kind='barh',ax=axesbi[1,0])
new_df.groupby('flat_model')['resale_price'].mean().plot(kind='barh',ax=axesbi[1,1])
sns.boxplot(x="no_times_resold", y="age", data=new_df,ax=axesbi[1,2])
sns.boxplot(x="age", y="resale_price", data=new_df,ax=axesbi[1,3])
Out[39]:
In [43]:
sns.jointplot(x="age", y="no_times_resold", data=new_df);
In [45]:
# Heat map of the pairwise correlations between the columns of new_df.
f, ax = plt.subplots(figsize=(10, 8))
corr = new_df.corr()
# The mask is all False, so every cell is drawn. Builtin `bool` replaces the
# `np.bool` alias, which NumPy deprecated and later removed.
sns.heatmap(corr,
            mask=np.zeros_like(corr, dtype=bool),
            cmap='hot',
            square=True, ax=ax)
#resaleprice and no_times_resold
Out[45]:
In [47]:
# Clamp resale_price and floor_area_sqm to their 0.5th / 99.5th percentiles so a handful of
# extreme rows do not dominate the plot. Series.clip replaces the chained `.ix[...] = value`
# assignments: `.ix` no longer exists in current pandas, and chained assignment can end up
# writing to a temporary copy instead of new_df.
for col in ('resale_price', 'floor_area_sqm'):
    llimit, ulimit = np.percentile(new_df[col].values, [0.5, 99.5])
    new_df[col] = new_df[col].clip(lower=llimit, upper=ulimit)
# jointplot creates its own figure, so no separate plt.figure() call is made beforehand
# (it would only leave an empty figure behind).
# NOTE(review): `size=` is kept as written; newer seaborn releases call this argument `height` -- confirm
# against the installed version.
grid = sns.jointplot(x=np.log1p(new_df.floor_area_sqm.values), y=np.log1p(new_df.resale_price.values), size=10)
# Label the joint axes explicitly instead of relying on whichever axes pyplot considers current.
grid.set_axis_labels('Log of Total area in square metre', 'Log of Price', fontsize=12)
Out[47]:
In [48]:
plt.figure(figsize=(12,8))
sns.countplot(x="storey_range", data=new_df)
plt.ylabel('Count', fontsize=12)
plt.xlabel('storey_range', fontsize=12)
plt.xticks(rotation='vertical')
Out[48]:
In [22]:
# Median resale_price for each value of `storey_range`, drawn as a point plot.
grouped_df = new_df.groupby('storey_range')['resale_price'].median().reset_index()
plt.figure(figsize=(12,8))
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.pointplot(x=grouped_df.storey_range.values, y=grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Floor', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()
In [24]:
# Median resale_price for each value of `town`, drawn as a point plot.
grouped_df = new_df.groupby('town')['resale_price'].median().reset_index()
plt.figure(figsize=(12,8))
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.pointplot(x=grouped_df.town.values, y=grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Town', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()
In [27]:
new_df.columns
Out[27]:
In [37]:
# Median resale_price for each value of `no_times_resold`, drawn as a point plot.
grouped_df = new_df.groupby('no_times_resold')['resale_price'].median().reset_index()
plt.figure(figsize=(15,15))
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.pointplot(x=grouped_df.no_times_resold.values, y=grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('no_times_resold', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()
In [32]:
# Median resale_price for each value of `lease_commence_date`, drawn as a point plot.
grouped_df = new_df.groupby('lease_commence_date')['resale_price'].median().reset_index()
plt.figure(figsize=(15,15))
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.pointplot(x=grouped_df.lease_commence_date.values, y=grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('lease_commence_date', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()
In [31]:
# Median resale_price for each value of `floor_area_sqm`, drawn as a point plot.
grouped_df = new_df.groupby('floor_area_sqm')['resale_price'].median().reset_index()
plt.figure(figsize=(15,15))
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.pointplot(x=grouped_df.floor_area_sqm.values, y=grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Floor Area', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()
In [28]:
# Median resale_price for each value of `flat_type`, drawn as a point plot.
grouped_df = new_df.groupby('flat_type')['resale_price'].median().reset_index()
plt.figure(figsize=(12,8))
# x and y are passed by keyword: newer seaborn releases no longer accept them positionally.
sns.pointplot(x=grouped_df.flat_type.values, y=grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
# The x axis shows flat_type values, so label it accordingly (it previously read 'Town').
plt.xlabel('Flat Type', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()
In [ ]: