Basic Imports


In [1]:
## Necessary Imports
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
color = sns.color_palette()

In [2]:
import re

In [2]:
PATH = "kaggle\\case\\"

In [3]:
!dir {PATH}


 Volume in drive D is Local Disk
 Volume Serial Number is B408-A348

 Directory of D:\Github\fastai\courses\ml1\kaggle\case

03-Feb-18  09:54 AM    <DIR>          .
03-Feb-18  09:54 AM    <DIR>          ..
01-Feb-18  07:18 PM    <DIR>          .ipynb_checkpoints
03-Feb-18  09:54 AM            32,890 2000.png
03-Feb-18  09:54 AM            32,912 2010.png
03-Feb-18  09:54 AM            22,215 2017.png
28-Jan-18  11:46 PM        21,059,832 Housing_Resale.xlsx
24-Jan-18  11:08 PM            14,379 Housing_Resale_Questions.docx
03-Feb-18  07:18 AM            10,594 Housing_Resale_tables.xlsx
01-Feb-18  06:54 PM        13,243,906 main_validation.csv
01-Feb-18  02:33 PM        96,500,645 modelling_data.csv
29-Jan-18  10:49 PM        55,825,324 train.csv
01-Feb-18  07:09 PM        84,385,072 training_data.csv
02-Feb-18  12:01 AM           117,412 weights
              11 File(s)    271,245,181 bytes
               3 Dir(s)  187,587,559,424 bytes free

In [4]:
df_raw = pd.read_csv(f'{PATH}training_data.csv')

In [5]:
df_raw.columns


Out[5]:
Index(['Unnamed: 0', 'temp', 'month', 'town', 'flat_type', 'block',
       'street_name', 'storey_range', 'floor_area_sqm', 'flat_model',
       'lease_commence_date', 'resale_price', 'building_id', 'year', 'age',
       'no_resold', 'no_times_resold'],
      dtype='object')

In [6]:
df_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393287 entries, 0 to 393286
Data columns (total 17 columns):
Unnamed: 0             393287 non-null int64
temp                   393287 non-null object
month                  393287 non-null object
town                   393287 non-null object
flat_type              393287 non-null object
block                  393287 non-null object
street_name            393287 non-null object
storey_range           393287 non-null object
floor_area_sqm         393287 non-null float64
flat_model             393287 non-null object
lease_commence_date    393287 non-null float64
resale_price           393287 non-null float64
building_id            393287 non-null object
year                   393287 non-null float64
age                    393287 non-null float64
no_resold              393287 non-null int64
no_times_resold        393287 non-null int64
dtypes: float64(5), int64(3), object(9)
memory usage: 51.0+ MB

In [8]:
df_raw.describe(include='all')


Out[8]:
year month flat_type Unnamed: 3 street_name storey_range floor_area_sqm flat_model lease_commence_date resale_price Age
count 477833.000000 477833.00000 477833.0 442054 477833 477833 477833.000000 477833 477833.000000 4.778330e+05 0.0
unique NaN NaN 7.0 1 534 25 NaN 20 NaN NaN NaN
top NaN NaN 4.0 ROOM YISHUN RING RD 04 TO 06 NaN Model A NaN NaN NaN
freq NaN NaN 185387.0 442054 8171 119924 NaN 137439 NaN NaN NaN
mean 2007.352200 6.53927 NaN NaN NaN NaN 96.616555 NaN 1988.562422 3.194007e+05 NaN
std 5.043029 3.38208 NaN NaN NaN NaN 25.364864 NaN 9.576672 1.366493e+05 NaN
min 2000.000000 1.00000 NaN NaN NaN NaN 28.000000 NaN 1966.000000 2.800000e+04 NaN
25% 2003.000000 4.00000 NaN NaN NaN NaN 73.000000 NaN 1982.000000 2.160000e+05 NaN
50% 2007.000000 7.00000 NaN NaN NaN NaN 99.000000 NaN 1987.000000 3.000000e+05 NaN
75% 2011.000000 9.00000 NaN NaN NaN NaN 114.000000 NaN 1997.000000 3.999990e+05 NaN
max 2017.000000 12.00000 NaN NaN NaN NaN 297.000000 NaN 2015.000000 1.180000e+06 NaN

In [9]:
## Get A Quick Overview of What We Are Dealing With
sns.distplot(df_raw['resale_price']);



In [10]:
#skewness and kurtosis is Clearly Visible via this
print("Skewness: %f" % df_raw['resale_price'].skew())
print("Kurtosis: %f" % df_raw['resale_price'].kurt())


Skewness: 0.950600
Kurtosis: 1.252723

In [11]:
df_raw['resale_price'].describe()


Out[11]:
count    4.778330e+05
mean     3.194007e+05
std      1.366493e+05
min      2.800000e+04
25%      2.160000e+05
50%      3.000000e+05
75%      3.999990e+05
max      1.180000e+06
Name: resale_price, dtype: float64

In [12]:
def disply_dtype_plot(df = None):
    if df is None:
        return
    l = []
    cols = df.columns
    for i in cols:
        if df[i].dtype == 'int64':
            l.append('integer dtype')
        elif df[i].dtype == 'object':
            l.append('object dtype')
        elif df[i].dtype == 'float64':
            l.append('float dtype')
        else:
            pass
    sns.countplot(l)
    del l
disply_dtype_plot(df_raw)


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)

In [9]:
df_raw.head(0)


Out[9]:
month town flat_type block street_name storey_range floor_area_sqm flat_model lease_commence_date resale_price year age no_resold

In [8]:
df_raw.drop(['Unnamed: 0','temp','building_id','no_times_resold'], inplace=True, axis=1)

In [16]:
df_raw["Age"] = df_raw['year'] - df_raw['lease_commence_date']

Re


In [58]:
df_raw.tail(n=10)


Out[58]:
year month flat_type street_name storey_range floor_area_sqm flat_model lease_commence_date resale_price Age
477823 2017 11 5 YISHUN ST 21 10 TO 12 121.0 Improved 1985 480000.0 32
477824 2017 11 5 YISHUN ST 81 04 TO 06 122.0 Improved 1988 490000.0 29
477825 2017 11 5 YISHUN ST 81 07 TO 09 122.0 Improved 1988 548000.0 29
477826 2017 11 EXECUTIVE YISHUN AVE 6 01 TO 03 142.0 Apartment 1988 465000.0 29
477827 2017 11 EXECUTIVE YISHUN ST 61 01 TO 03 142.0 Apartment 1987 515000.0 30
477828 2017 11 EXECUTIVE YISHUN ST 61 10 TO 12 164.0 Apartment 1992 728000.0 25
477829 2017 11 EXECUTIVE YISHUN ST 81 01 TO 03 154.0 Maisonette 1987 648888.0 30
477830 2017 11 EXECUTIVE YISHUN ST 81 07 TO 09 142.0 Apartment 1988 658000.0 29
477831 2017 11 EXECUTIVE YISHUN ST 81 04 TO 06 142.0 Apartment 1987 598000.0 30
477832 2017 11 MULTI-GENERATION YISHUN AVE 4 01 TO 03 179.0 Multi Generation 1987 695000.0 30

In [15]:
month_non_split = list(df_raw['month'])

In [19]:
s = re.split('-','2000-01');print(s[0]);print(s[1])


2000
01

In [20]:
year = []
mon = []
for i in month_non_split:
    year.append(re.split('-', i)[0])
    mon.append(re.split('-', i)[1])

In [24]:
df_raw.drop(['month'],axis =1,inplace=True)

In [25]:
df_raw['month'] = mon
df_raw['year'] = year

In [26]:
df_raw.columns


Out[26]:
Index(['town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'lease_commence_date', 'resale_price',
       'month', 'year'],
      dtype='object')

In [27]:
df_raw.head()


Out[27]:
town flat_type block street_name storey_range floor_area_sqm flat_model lease_commence_date resale_price month year
0 ANG MO KIO 3 ROOM 216 ANG MO KIO AVE 1 07 TO 09 73.0 New Generation 1976 159000.0 01 2000
1 ANG MO KIO 3 ROOM 215 ANG MO KIO AVE 1 07 TO 09 73.0 New Generation 1976 167000.0 01 2000
2 ANG MO KIO 3 ROOM 218 ANG MO KIO AVE 1 07 TO 09 67.0 New Generation 1976 163000.0 01 2000
3 ANG MO KIO 4 ROOM 304 ANG MO KIO AVE 1 04 TO 06 97.0 New Generation 1977 260000.0 01 2000
4 ANG MO KIO 5 ROOM 221 ANG MO KIO AVE 1 10 TO 12 127.0 Improved 1993 408000.0 01 2000

Plots


In [27]:
var = 'Age'
plt.figure(figsize=(10,10))
plt.xticks(rotation = 90)
sns.countplot(df_raw[var])


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x27a1f776cc0>

In [58]:
var = 'month'
plt.xticks(rotation = 45)
sns.countplot(df_raw.month)


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a7401bd588>

In [59]:
plt.xticks(rotation = 45)
sns.countplot(df_raw.year)


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a73faff780>

In [33]:
var = 'flat_type'
plt.xticks(rotation = 45)
sns.countplot(df_raw.flat_type)


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a735485630>

In [43]:
from collections import Counter
Counter(df_raw['street_name'])


Out[43]:
Counter({'ADMIRALTY DR': 1763,
         'ADMIRALTY LINK': 1062,
         'AH HOOD RD': 75,
         'ALJUNIED CRES': 1164,
         'ALJUNIED RD': 70,
         'ANCHORVALE DR': 699,
         'ANCHORVALE LANE': 298,
         'ANCHORVALE LINK': 1253,
         'ANCHORVALE RD': 1085,
         'ANG MO KIO AVE 1': 2085,
         'ANG MO KIO AVE 10': 6656,
         'ANG MO KIO AVE 2': 107,
         'ANG MO KIO AVE 3': 5830,
         'ANG MO KIO AVE 4': 3651,
         'ANG MO KIO AVE 5': 3108,
         'ANG MO KIO AVE 6': 1236,
         'ANG MO KIO AVE 8': 880,
         'ANG MO KIO AVE 9': 395,
         'ANG MO KIO ST 11': 77,
         'ANG MO KIO ST 21': 90,
         'ANG MO KIO ST 31': 235,
         'ANG MO KIO ST 32': 369,
         'ANG MO KIO ST 52': 299,
         'BAIN ST': 90,
         'BALAM RD': 1049,
         'BANGKIT RD': 1003,
         'BEACH RD': 330,
         'BEDOK CTRL': 168,
         'BEDOK NTH AVE 1': 1123,
         'BEDOK NTH AVE 2': 1099,
         'BEDOK NTH AVE 3': 1010,
         'BEDOK NTH AVE 4': 1197,
         'BEDOK NTH RD': 3312,
         'BEDOK NTH ST 1': 800,
         'BEDOK NTH ST 2': 1322,
         'BEDOK NTH ST 3': 3659,
         'BEDOK NTH ST 4': 271,
         'BEDOK RESERVOIR RD': 6326,
         'BEDOK RESERVOIR VIEW': 868,
         'BEDOK STH AVE 1': 637,
         'BEDOK STH AVE 2': 996,
         'BEDOK STH AVE 3': 648,
         'BEDOK STH RD': 1254,
         'BENDEMEER RD': 860,
         'BEO CRES': 236,
         'BISHAN ST 11': 367,
         'BISHAN ST 12': 1923,
         'BISHAN ST 13': 2095,
         'BISHAN ST 22': 1398,
         'BISHAN ST 23': 1053,
         'BISHAN ST 24': 1017,
         'BOON KENG RD': 350,
         'BOON LAY AVE': 1250,
         'BOON LAY DR': 1267,
         'BOON LAY PL': 1876,
         'BOON TIONG RD': 262,
         'BRIGHT HILL DR': 142,
         'BT BATOK CTRL': 1297,
         'BT BATOK EAST AVE 3': 817,
         'BT BATOK EAST AVE 4': 1234,
         'BT BATOK EAST AVE 5': 1760,
         'BT BATOK ST 11': 660,
         'BT BATOK ST 21': 1556,
         'BT BATOK ST 22': 168,
         'BT BATOK ST 24': 882,
         'BT BATOK ST 25': 1030,
         'BT BATOK ST 31': 1572,
         'BT BATOK ST 32': 815,
         'BT BATOK ST 33': 832,
         'BT BATOK ST 34': 1049,
         'BT BATOK ST 51': 506,
         'BT BATOK ST 52': 1553,
         'BT BATOK WEST AVE 2': 516,
         'BT BATOK WEST AVE 4': 839,
         'BT BATOK WEST AVE 5': 871,
         'BT BATOK WEST AVE 6': 2483,
         'BT BATOK WEST AVE 7': 159,
         'BT BATOK WEST AVE 8': 1209,
         'BT MERAH CTRL': 155,
         'BT MERAH LANE 1': 25,
         'BT MERAH VIEW': 981,
         'BT PANJANG RING RD': 2124,
         'BT PURMEI RD': 970,
         'BUANGKOK CRES': 604,
         'BUANGKOK LINK': 199,
         'BUFFALO RD': 146,
         "C'WEALTH AVE": 187,
         "C'WEALTH AVE WEST": 357,
         "C'WEALTH CL": 951,
         "C'WEALTH CRES": 750,
         "C'WEALTH DR": 1619,
         'CAMBRIDGE RD': 260,
         'CANBERRA LINK': 160,
         'CANBERRA RD': 1510,
         'CANTONMENT CL': 462,
         'CANTONMENT RD': 255,
         'CASHEW RD': 293,
         'CASSIA CRES': 491,
         'CHAI CHEE AVE': 580,
         'CHAI CHEE DR': 337,
         'CHAI CHEE RD': 971,
         'CHAI CHEE ST': 1021,
         'CHANDER RD': 57,
         'CHANGI VILLAGE RD': 139,
         'CHIN SWEE RD': 48,
         'CHOA CHU KANG AVE 1': 202,
         'CHOA CHU KANG AVE 2': 1135,
         'CHOA CHU KANG AVE 3': 1840,
         'CHOA CHU KANG AVE 4': 2603,
         'CHOA CHU KANG AVE 5': 829,
         'CHOA CHU KANG CRES': 3064,
         'CHOA CHU KANG CTRL': 1577,
         'CHOA CHU KANG DR': 646,
         'CHOA CHU KANG LOOP': 198,
         'CHOA CHU KANG NTH 5': 631,
         'CHOA CHU KANG NTH 6': 1088,
         'CHOA CHU KANG NTH 7': 170,
         'CHOA CHU KANG ST 51': 963,
         'CHOA CHU KANG ST 52': 827,
         'CHOA CHU KANG ST 53': 397,
         'CHOA CHU KANG ST 54': 429,
         'CHOA CHU KANG ST 62': 1316,
         'CHOA CHU KANG ST 64': 504,
         'CIRCUIT RD': 2891,
         'CLARENCE LANE': 212,
         'CLEMENTI AVE 1': 406,
         'CLEMENTI AVE 2': 1780,
         'CLEMENTI AVE 3': 1297,
         'CLEMENTI AVE 4': 2022,
         'CLEMENTI AVE 5': 1475,
         'CLEMENTI AVE 6': 249,
         'CLEMENTI ST 11': 31,
         'CLEMENTI ST 12': 33,
         'CLEMENTI ST 13': 421,
         'CLEMENTI ST 14': 117,
         'CLEMENTI WEST ST 1': 1060,
         'CLEMENTI WEST ST 2': 1797,
         'COMPASSVALE BOW': 190,
         'COMPASSVALE CRES': 473,
         'COMPASSVALE DR': 617,
         'COMPASSVALE LANE': 675,
         'COMPASSVALE LINK': 786,
         'COMPASSVALE RD': 1090,
         'COMPASSVALE ST': 1377,
         'COMPASSVALE WALK': 1312,
         'CORPORATION DR': 390,
         'CRAWFORD LANE': 228,
         'DAKOTA CRES': 180,
         'DELTA AVE': 219,
         'DEPOT RD': 545,
         'DORSET RD': 189,
         'DOVER CL EAST': 154,
         'DOVER CRES': 780,
         'DOVER RD': 351,
         'EAST COAST RD': 45,
         'EDGEDALE PLAINS': 1402,
         'EDGEFIELD PLAINS': 1522,
         'ELIAS RD': 578,
         'EMPRESS RD': 120,
         'EUNOS CRES': 1462,
         'EUNOS RD 5': 245,
         'EVERTON PK': 250,
         'FAJAR RD': 1177,
         'FARRER PK RD': 276,
         'FARRER RD': 92,
         'FERNVALE LANE': 387,
         'FERNVALE LINK': 191,
         'FERNVALE RD': 1132,
         'FRENCH RD': 59,
         'GANGSA RD': 1345,
         'GEYLANG BAHRU': 1504,
         'GEYLANG EAST AVE 1': 459,
         'GEYLANG EAST AVE 2': 5,
         'GEYLANG EAST CTRL': 116,
         'GEYLANG SERAI': 110,
         'GHIM MOH LINK': 73,
         'GHIM MOH RD': 1387,
         'GLOUCESTER RD': 149,
         'HAIG RD': 801,
         'HAVELOCK RD': 343,
         'HENDERSON CRES': 417,
         'HENDERSON RD': 78,
         'HO CHING RD': 695,
         'HOLLAND AVE': 937,
         'HOLLAND CL': 965,
         'HOLLAND DR': 840,
         'HOUGANG AVE 1': 2157,
         'HOUGANG AVE 10': 1946,
         'HOUGANG AVE 2': 757,
         'HOUGANG AVE 3': 1666,
         'HOUGANG AVE 4': 1123,
         'HOUGANG AVE 5': 1831,
         'HOUGANG AVE 6': 1184,
         'HOUGANG AVE 7': 746,
         'HOUGANG AVE 8': 4667,
         'HOUGANG AVE 9': 1066,
         'HOUGANG CTRL': 685,
         'HOUGANG ST 11': 448,
         'HOUGANG ST 21': 355,
         'HOUGANG ST 22': 522,
         'HOUGANG ST 31': 213,
         'HOUGANG ST 51': 1653,
         'HOUGANG ST 52': 532,
         'HOUGANG ST 61': 825,
         'HOUGANG ST 91': 1896,
         'HOUGANG ST 92': 571,
         'HOY FATT RD': 213,
         'INDUS RD': 149,
         'JELAPANG RD': 2669,
         'JELEBU RD': 343,
         'JELLICOE RD': 116,
         'JLN BAHAGIA': 799,
         'JLN BATU': 662,
         'JLN BERSEH': 29,
         'JLN BT HO SWEE': 318,
         'JLN BT MERAH': 1820,
         'JLN DAMAI': 508,
         'JLN DUA': 112,
         'JLN DUSUN': 62,
         'JLN KAYU': 26,
         'JLN KLINIK': 290,
         'JLN KUKOH': 190,
         "JLN MA'MOR": 99,
         'JLN MEMBINA': 668,
         'JLN PASAR BARU': 36,
         'JLN RAJAH': 372,
         'JLN RUMAH TINGGI': 598,
         'JLN TECK WHYE': 996,
         'JLN TENAGA': 889,
         'JLN TENTERAM': 226,
         'JLN TIGA': 94,
         'JOO CHIAT RD': 112,
         'JOO SENG RD': 446,
         'JURONG EAST AVE 1': 689,
         'JURONG EAST ST 13': 991,
         'JURONG EAST ST 21': 2717,
         'JURONG EAST ST 24': 1589,
         'JURONG EAST ST 31': 589,
         'JURONG EAST ST 32': 1346,
         'JURONG WEST AVE 1': 1653,
         'JURONG WEST AVE 3': 326,
         'JURONG WEST AVE 5': 521,
         'JURONG WEST CTRL 1': 1667,
         'JURONG WEST ST 24': 415,
         'JURONG WEST ST 25': 401,
         'JURONG WEST ST 41': 2015,
         'JURONG WEST ST 42': 2757,
         'JURONG WEST ST 51': 45,
         'JURONG WEST ST 52': 2380,
         'JURONG WEST ST 61': 2458,
         'JURONG WEST ST 62': 199,
         'JURONG WEST ST 64': 1983,
         'JURONG WEST ST 65': 2923,
         'JURONG WEST ST 71': 788,
         'JURONG WEST ST 72': 117,
         'JURONG WEST ST 73': 672,
         'JURONG WEST ST 74': 714,
         'JURONG WEST ST 75': 206,
         'JURONG WEST ST 81': 2836,
         'JURONG WEST ST 91': 1876,
         'JURONG WEST ST 92': 688,
         'JURONG WEST ST 93': 644,
         'KALLANG BAHRU': 534,
         'KANG CHING RD': 882,
         'KELANTAN RD': 154,
         'KENT RD': 127,
         'KG ARANG RD': 106,
         'KG BAHRU HILL': 18,
         'KG KAYU RD': 52,
         'KIM CHENG ST': 49,
         'KIM KEAT AVE': 636,
         'KIM KEAT LINK': 300,
         'KIM PONG RD': 23,
         'KIM TIAN PL': 254,
         'KIM TIAN RD': 658,
         "KING GEORGE'S AVE": 145,
         'KLANG LANE': 141,
         'KRETA AYER RD': 39,
         'LENGKOK BAHRU': 457,
         'LENGKONG TIGA': 409,
         'LIM LIAK ST': 80,
         'LOMPANG RD': 582,
         'LOR 1 TOA PAYOH': 2657,
         'LOR 1A TOA PAYOH': 86,
         'LOR 2 TOA PAYOH': 1463,
         'LOR 3 GEYLANG': 78,
         'LOR 3 TOA PAYOH': 228,
         'LOR 4 TOA PAYOH': 1321,
         'LOR 5 TOA PAYOH': 1220,
         'LOR 6 TOA PAYOH': 261,
         'LOR 7 TOA PAYOH': 1534,
         'LOR 8 TOA PAYOH': 1566,
         'LOR AH SOO': 882,
         'LOR LEW LIAN': 534,
         'LOR LIMAU': 271,
         'LOWER DELTA RD': 47,
         'MACPHERSON LANE': 224,
         'MARGARET DR': 13,
         'MARINE CRES': 1158,
         'MARINE DR': 1160,
         'MARINE PARADE CTRL': 2,
         'MARINE TER': 1383,
         'MARSILING CRES': 324,
         'MARSILING DR': 3336,
         'MARSILING LANE': 828,
         'MARSILING RD': 296,
         'MARSILING RISE': 790,
         'MCNAIR RD': 680,
         'MEI LING ST': 956,
         'MOH GUAN TER': 74,
         'MONTREAL DR': 469,
         'MOULMEIN RD': 38,
         'NEW MKT RD': 56,
         'NEW UPP CHANGI RD': 1486,
         'NTH BRIDGE RD': 157,
         'OLD AIRPORT RD': 119,
         'OWEN RD': 73,
         'PANDAN GDNS': 880,
         'PASIR RIS DR 1': 635,
         'PASIR RIS DR 10': 877,
         'PASIR RIS DR 3': 456,
         'PASIR RIS DR 4': 1040,
         'PASIR RIS DR 6': 1877,
         'PASIR RIS ST 11': 1934,
         'PASIR RIS ST 12': 911,
         'PASIR RIS ST 13': 406,
         'PASIR RIS ST 21': 2068,
         'PASIR RIS ST 41': 110,
         'PASIR RIS ST 51': 1124,
         'PASIR RIS ST 52': 879,
         'PASIR RIS ST 53': 700,
         'PASIR RIS ST 71': 3107,
         'PASIR RIS ST 72': 671,
         'PAYA LEBAR WAY': 498,
         'PENDING RD': 1478,
         'PETIR RD': 1734,
         'PINE CL': 381,
         'PIPIT RD': 312,
         'POTONG PASIR AVE 1': 1328,
         'POTONG PASIR AVE 2': 168,
         'POTONG PASIR AVE 3': 343,
         'PUNGGOL CTRL': 1648,
         'PUNGGOL DR': 959,
         'PUNGGOL EAST': 110,
         'PUNGGOL FIELD': 1971,
         'PUNGGOL FIELD WALK': 198,
         'PUNGGOL PL': 303,
         'PUNGGOL RD': 256,
         'PUNGGOL WALK': 21,
         'QUEEN ST': 139,
         "QUEEN'S CL": 379,
         "QUEEN'S RD": 240,
         'QUEENSWAY': 119,
         'RACE COURSE RD': 192,
         'REDHILL CL': 748,
         'REDHILL LANE': 159,
         'REDHILL RD': 282,
         'RIVERVALE CRES': 1463,
         'RIVERVALE DR': 2495,
         'RIVERVALE ST': 854,
         'RIVERVALE WALK': 808,
         'ROCHOR RD': 168,
         'ROWELL RD': 189,
         'SAGO LANE': 118,
         'SAUJANA RD': 234,
         'SEGAR RD': 1113,
         'SELEGIE RD': 253,
         'SELETAR WEST FARMWAY 6': 8,
         'SEMBAWANG CL': 1340,
         'SEMBAWANG CRES': 444,
         'SEMBAWANG DR': 1573,
         'SEMBAWANG VISTA': 328,
         'SEMBAWANG WAY': 10,
         'SENG POH RD': 14,
         'SENGKANG CTRL': 235,
         'SENGKANG EAST RD': 228,
         'SENGKANG EAST WAY': 964,
         'SENGKANG WEST AVE': 113,
         'SENJA LINK': 168,
         'SENJA RD': 1788,
         'SERANGOON AVE 1': 350,
         'SERANGOON AVE 2': 1285,
         'SERANGOON AVE 3': 1249,
         'SERANGOON AVE 4': 1393,
         'SERANGOON CTRL': 1244,
         'SERANGOON CTRL DR': 340,
         'SERANGOON NTH AVE 1': 1953,
         'SERANGOON NTH AVE 2': 267,
         'SERANGOON NTH AVE 3': 860,
         'SERANGOON NTH AVE 4': 1923,
         'SHUNFU RD': 658,
         'SILAT AVE': 91,
         'SIMEI LANE': 68,
         'SIMEI RD': 464,
         'SIMEI ST 1': 2582,
         'SIMEI ST 2': 260,
         'SIMEI ST 4': 634,
         'SIMEI ST 5': 403,
         'SIMS AVE': 92,
         'SIMS DR': 693,
         'SIMS PL': 170,
         'SIN MING AVE': 1018,
         'SIN MING RD': 389,
         'SMITH ST': 116,
         'SPOTTISWOODE PK RD': 180,
         "ST. GEORGE'S LANE": 453,
         "ST. GEORGE'S RD": 843,
         'STIRLING RD': 1061,
         'STRATHMORE AVE': 835,
         'TAH CHING RD': 754,
         'TAMAN HO SWEE': 278,
         'TAMPINES AVE 4': 1095,
         'TAMPINES AVE 5': 763,
         'TAMPINES AVE 7': 241,
         'TAMPINES AVE 8': 552,
         'TAMPINES AVE 9': 650,
         'TAMPINES CTRL 1': 146,
         'TAMPINES CTRL 7': 165,
         'TAMPINES ST 11': 1361,
         'TAMPINES ST 12': 1259,
         'TAMPINES ST 21': 3554,
         'TAMPINES ST 22': 2050,
         'TAMPINES ST 23': 987,
         'TAMPINES ST 24': 583,
         'TAMPINES ST 32': 1095,
         'TAMPINES ST 33': 1760,
         'TAMPINES ST 34': 1234,
         'TAMPINES ST 41': 1970,
         'TAMPINES ST 42': 1310,
         'TAMPINES ST 43': 1024,
         'TAMPINES ST 44': 517,
         'TAMPINES ST 45': 1936,
         'TAMPINES ST 71': 2428,
         'TAMPINES ST 72': 1033,
         'TAMPINES ST 81': 1505,
         'TAMPINES ST 82': 1335,
         'TAMPINES ST 83': 1677,
         'TAMPINES ST 84': 675,
         'TAMPINES ST 91': 1050,
         'TANGLIN HALT RD': 1930,
         'TAO CHING RD': 107,
         'TEBAN GDNS RD': 1680,
         'TECK WHYE AVE': 669,
         'TECK WHYE LANE': 2046,
         'TELOK BLANGAH CRES': 1737,
         'TELOK BLANGAH DR': 1317,
         'TELOK BLANGAH HTS': 1069,
         'TELOK BLANGAH RISE': 576,
         'TELOK BLANGAH ST 31': 45,
         'TELOK BLANGAH WAY': 214,
         'TESSENSOHN RD': 42,
         'TG PAGAR PLAZA': 518,
         'TIONG BAHRU RD': 62,
         'TOA PAYOH CTRL': 437,
         'TOA PAYOH EAST': 431,
         'TOA PAYOH NTH': 983,
         'TOH GUAN RD': 1957,
         'TOH YI DR': 724,
         'TOWNER RD': 388,
         'UBI AVE 1': 1901,
         'UPP ALJUNIED LANE': 240,
         'UPP BOON KENG RD': 1001,
         'UPP CROSS ST': 310,
         'UPP SERANGOON RD': 80,
         'VEERASAMY RD': 201,
         'WATERLOO ST': 150,
         'WELLINGTON CIRCLE': 710,
         'WEST COAST DR': 693,
         'WEST COAST RD': 1305,
         'WHAMPOA DR': 1283,
         'WHAMPOA RD': 320,
         'WHAMPOA STH': 46,
         'WHAMPOA WEST': 221,
         'WOODLANDS AVE 1': 1435,
         'WOODLANDS AVE 3': 237,
         'WOODLANDS AVE 4': 1150,
         'WOODLANDS AVE 5': 768,
         'WOODLANDS AVE 6': 2257,
         'WOODLANDS AVE 9': 271,
         'WOODLANDS CIRCLE': 3079,
         'WOODLANDS CRES': 1196,
         'WOODLANDS CTR RD': 108,
         'WOODLANDS DR 14': 2622,
         'WOODLANDS DR 16': 1865,
         'WOODLANDS DR 40': 704,
         'WOODLANDS DR 42': 594,
         'WOODLANDS DR 44': 467,
         'WOODLANDS DR 50': 2383,
         'WOODLANDS DR 52': 745,
         'WOODLANDS DR 53': 403,
         'WOODLANDS DR 60': 1130,
         'WOODLANDS DR 62': 620,
         'WOODLANDS DR 70': 1023,
         'WOODLANDS DR 71': 263,
         'WOODLANDS DR 72': 374,
         'WOODLANDS DR 73': 639,
         'WOODLANDS DR 75': 1310,
         'WOODLANDS RING RD': 3351,
         'WOODLANDS ST 11': 232,
         'WOODLANDS ST 13': 2200,
         'WOODLANDS ST 31': 873,
         'WOODLANDS ST 32': 656,
         'WOODLANDS ST 41': 1978,
         'WOODLANDS ST 81': 909,
         'WOODLANDS ST 82': 1572,
         'WOODLANDS ST 83': 1478,
         'YISHUN AVE 11': 1102,
         'YISHUN AVE 2': 541,
         'YISHUN AVE 3': 724,
         'YISHUN AVE 4': 1239,
         'YISHUN AVE 5': 1892,
         'YISHUN AVE 6': 2029,
         'YISHUN AVE 7': 441,
         'YISHUN AVE 9': 354,
         'YISHUN CTRL': 669,
         'YISHUN CTRL 1': 160,
         'YISHUN RING RD': 8171,
         'YISHUN ST 11': 2349,
         'YISHUN ST 20': 219,
         'YISHUN ST 21': 2305,
         'YISHUN ST 22': 1664,
         'YISHUN ST 41': 59,
         'YISHUN ST 61': 2042,
         'YISHUN ST 71': 1366,
         'YISHUN ST 72': 1643,
         'YISHUN ST 81': 2074,
         'YUAN CHING RD': 94,
         'YUNG AN RD': 300,
         'YUNG HO RD': 150,
         'YUNG KUANG RD': 111,
         'YUNG LOH RD': 265,
         'YUNG PING RD': 311,
         'YUNG SHENG RD': 284,
         'ZION RD': 141})

In [47]:
var = 'lease_commence_date'
data = pd.concat([df_raw['resale_price'], df_raw[var]], axis=1)
data.plot.scatter(x=var, y='resale_price')


Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a7401b9c50>

In [48]:
var = 'year'
data = pd.concat([df_raw['resale_price'], df_raw[var]], axis=1)
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x=var, y="resale_price", data=data)
plt.xticks(rotation=90);


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:462: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  box_data = remove_na(group_data)

In [42]:
var = 'street_name'
plt.figure(figsize=(15,15))
plt.xticks(rotation = 90)
sns.countplot(df_raw[var])


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a738309128>

In [38]:
var = 'town'
plt.figure(figsize=(8,8))
plt.xticks(rotation = 90)
sns.countplot(df_raw[var])


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a742e40278>

In [39]:
var = 'storey_range'
plt.xticks(rotation = 45)
sns.countplot(df_raw[var])


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a74227d710>

In [44]:
var = 'flat_model'
plt.figure(figsize=(8,8))
plt.xticks(rotation = 90)
sns.countplot(df_raw[var])


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a73cb9cef0>

Here only few types are palying the role as such.. To name them they are NewGen,Improved,ModelA,Standard,Apart,Simplified,Premium,Maisonette rest we can remove them

Unique values


In [64]:
train_cats(df_raw)
#it's a helper function to aotumate the boring stuffs..
# For Further Insights do a `shift+tab' or a `??train_cats`

In [65]:
df_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 477833 entries, 0 to 477832
Data columns (total 11 columns):
town                   477833 non-null category
flat_type              477833 non-null category
block                  477833 non-null category
street_name            477833 non-null category
storey_range           477833 non-null category
floor_area_sqm         477833 non-null float64
flat_model             477833 non-null category
lease_commence_date    477833 non-null int64
resale_price           477833 non-null float64
month                  477833 non-null category
year                   477833 non-null category
dtypes: category(8), float64(2), int64(1)
memory usage: 15.6 MB

In [66]:
df_raw['month'].cat.codes


Out[66]:
0          0
1          0
2          0
3          0
4          0
5          0
6          0
7          0
8          0
9          0
10         0
11         0
12         0
13         0
14         0
15         0
16         0
17         0
18         0
19         0
20         0
21         0
22         0
23         0
24         0
25         0
26         0
27         0
28         0
29         0
          ..
477803    10
477804    10
477805    10
477806    10
477807    10
477808    10
477809    10
477810    10
477811    10
477812    10
477813    10
477814    10
477815    10
477816    10
477817    10
477818    10
477819    10
477820    10
477821    10
477822    10
477823    10
477824    10
477825    10
477826    10
477827    10
477828    10
477829    10
477830    10
477831    10
477832    10
Length: 477833, dtype: int8

In [68]:
#correlation matrix
corrmat = df_raw.corr()
sns.heatmap(corrmat, vmax=.8, square=True);


RF


In [10]:
df_raw.resale_price = np.log(df_raw.resale_price)

In [11]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)
    return(res)

In [14]:
train_cats(df_raw)

In [15]:
df, y, _ = proc_df(df_raw, 'resale_price')

In [16]:
df.columns


Out[16]:
Index(['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'lease_commence_date', 'year', 'age',
       'no_resold'],
      dtype='object')

In [17]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)


Out[17]:
0.99530658728560062

In [18]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

n_valid = 40000
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)
X_train.shape, y_train.shape, X_valid.shape


Out[18]:
((353287, 12), (353287,), (40000, 12))

In [19]:
m = RandomForestRegressor(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
m.score(df,y)


Out[19]:
0.77697789832166875

In [20]:
draw_tree(m.estimators_[0], X_train, precision=3)


Tree 0 floor_area_sqm ≤ 84.5 mse = 0.164 samples = 353287 value = 12.487 1 month ≤ 100.5 mse = 0.121 samples = 131719 value = 12.172 0->1 True 8 month ≤ 101.5 mse = 0.096 samples = 221568 value = 12.675 0->8 False 2 floor_area_sqm ≤ 74.5 mse = 0.052 samples = 91114 value = 11.995 1->2 5 month ≤ 126.5 mse = 0.049 samples = 40605 value = 12.569 1->5 3 mse = 0.045 samples = 72021 value = 11.945 2->3 4 mse = 0.035 samples = 19093 value = 12.18 2->4 6 mse = 0.034 samples = 20325 value = 12.433 5->6 7 mse = 0.026 samples = 20280 value = 12.706 5->7 9 flat_type ≤ 4.5 mse = 0.067 samples = 149709 value = 12.55 8->9 12 month ≤ 127.5 mse = 0.055 samples = 71859 value = 12.937 8->12 10 mse = 0.032 samples = 83299 value = 12.393 9->10 11 mse = 0.041 samples = 66410 value = 12.747 9->11 13 mse = 0.044 samples = 40258 value = 12.835 12->13 14 mse = 0.039 samples = 31601 value = 13.066 12->14

In [21]:
fi = rf_feat_importance(m, df); fi[:10]


Out[21]:
cols imp
6 floor_area_sqm 0.487445
0 month 0.408782
2 flat_type 0.103774
1 town 0.000000
3 block 0.000000
4 street_name 0.000000
5 storey_range 0.000000
7 flat_model 0.000000
8 lease_commence_date 0.000000
9 year 0.000000

In [22]:
fi.plot('cols', 'imp', figsize=(10,6), legend=False);



In [23]:
def plot_fi(fi): 
    return fi.plot('cols', 'imp', 'barh', figsize=(12,7), legend=False)

In [24]:
plot_fi(fi[:10]);



In [25]:
from scipy.cluster import hierarchy as hc

In [26]:
corr = np.round(scipy.stats.spearmanr(df).correlation, 4)
corr_condensed = hc.distance.squareform(1-corr)
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(10,10))
dendrogram = hc.dendrogram(z, labels=df.columns, orientation='left', leaf_font_size=16)
plt.show()


Partial Dependence


In [28]:
from pdpbox import pdp
from plotnine import *

In [29]:
set_rf_samples(10000)

In [30]:
df_trn2, y_trn, _ = proc_df(df_raw, 'resale_price', max_n_cat=20)
X_train, X_valid = split_vals(df_trn2, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.6, n_jobs=-1)
m.fit(X_train, y_train);

In [31]:
plot_fi(rf_feat_importance(m, df_trn2)[:20]);



In [32]:
x = get_sample(X_train, 500)

In [43]:
def plot_pdp(feat, clusters=None, feat_name=None):
    feat_name = feat_name or feat
    p = pdp.pdp_isolate(m, x, feat)
    return pdp.pdp_plot(p, feat_name, plot_lines=True, 
                        cluster=clusters is not None, n_cluster_centers=clusters)

In [48]:
df_raw.resale_price = np.exp(df_raw.resale_price)

In [49]:
x_all = get_sample(df_raw, 500)

In [50]:
ggplot(x_all, aes('year', 'resale_price'))+stat_smooth(se=True, method='loess')


Out[50]:
<ggplot: (91195899298)>

In [51]:
plot_pdp('year')