Basic Imports


In [1]:
## Necessary Imports
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
color = sns.color_palette()

In [2]:
import re

In [2]:
PATH = "kaggle\\case\\"

In [3]:
!dir {PATH}


 Volume in drive D is Local Disk
 Volume Serial Number is B408-A348

 Directory of D:\Github\fastai\courses\ml1\kaggle\case

03-Feb-18  09:54 AM    <DIR>          .
03-Feb-18  09:54 AM    <DIR>          ..
01-Feb-18  07:18 PM    <DIR>          .ipynb_checkpoints
03-Feb-18  09:54 AM            32,890 2000.png
03-Feb-18  09:54 AM            32,912 2010.png
03-Feb-18  09:54 AM            22,215 2017.png
28-Jan-18  11:46 PM        21,059,832 Housing_Resale.xlsx
24-Jan-18  11:08 PM            14,379 Housing_Resale_Questions.docx
03-Feb-18  07:18 AM            10,594 Housing_Resale_tables.xlsx
01-Feb-18  06:54 PM        13,243,906 main_validation.csv
01-Feb-18  02:33 PM        96,500,645 modelling_data.csv
29-Jan-18  10:49 PM        55,825,324 train.csv
01-Feb-18  07:09 PM        84,385,072 training_data.csv
02-Feb-18  12:01 AM           117,412 weights
              11 File(s)    271,245,181 bytes
               3 Dir(s)  187,587,559,424 bytes free

In [4]:
df_raw = pd.read_csv(f'{PATH}training_data.csv')

In [5]:
df_raw.columns


Out[5]:
Index(['Unnamed: 0', 'temp', 'month', 'town', 'flat_type', 'block',
       'street_name', 'storey_range', 'floor_area_sqm', 'flat_model',
       'lease_commence_date', 'resale_price', 'building_id', 'year', 'age',
       'no_resold', 'no_times_resold'],
      dtype='object')

In [6]:
df_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 393287 entries, 0 to 393286
Data columns (total 17 columns):
Unnamed: 0             393287 non-null int64
temp                   393287 non-null object
month                  393287 non-null object
town                   393287 non-null object
flat_type              393287 non-null object
block                  393287 non-null object
street_name            393287 non-null object
storey_range           393287 non-null object
floor_area_sqm         393287 non-null float64
flat_model             393287 non-null object
lease_commence_date    393287 non-null float64
resale_price           393287 non-null float64
building_id            393287 non-null object
year                   393287 non-null float64
age                    393287 non-null float64
no_resold              393287 non-null int64
no_times_resold        393287 non-null int64
dtypes: float64(5), int64(3), object(9)
memory usage: 51.0+ MB

In [8]:
df_raw.describe(include='all')


Out[8]:
year month flat_type Unnamed: 3 street_name storey_range floor_area_sqm flat_model lease_commence_date resale_price Age
count 477833.000000 477833.00000 477833.0 442054 477833 477833 477833.000000 477833 477833.000000 4.778330e+05 0.0
unique NaN NaN 7.0 1 534 25 NaN 20 NaN NaN NaN
top NaN NaN 4.0 ROOM YISHUN RING RD 04 TO 06 NaN Model A NaN NaN NaN
freq NaN NaN 185387.0 442054 8171 119924 NaN 137439 NaN NaN NaN
mean 2007.352200 6.53927 NaN NaN NaN NaN 96.616555 NaN 1988.562422 3.194007e+05 NaN
std 5.043029 3.38208 NaN NaN NaN NaN 25.364864 NaN 9.576672 1.366493e+05 NaN
min 2000.000000 1.00000 NaN NaN NaN NaN 28.000000 NaN 1966.000000 2.800000e+04 NaN
25% 2003.000000 4.00000 NaN NaN NaN NaN 73.000000 NaN 1982.000000 2.160000e+05 NaN
50% 2007.000000 7.00000 NaN NaN NaN NaN 99.000000 NaN 1987.000000 3.000000e+05 NaN
75% 2011.000000 9.00000 NaN NaN NaN NaN 114.000000 NaN 1997.000000 3.999990e+05 NaN
max 2017.000000 12.00000 NaN NaN NaN NaN 297.000000 NaN 2015.000000 1.180000e+06 NaN

In [9]:
## Get A Quick Overview of What We Are Dealing With
sns.distplot(df_raw['resale_price']);



In [10]:
#skewness and kurtosis is Clearly Visible via this
print("Skewness: %f" % df_raw['resale_price'].skew())
print("Kurtosis: %f" % df_raw['resale_price'].kurt())


Skewness: 0.950600
Kurtosis: 1.252723

In [11]:
df_raw['resale_price'].describe()


Out[11]:
count    4.778330e+05
mean     3.194007e+05
std      1.366493e+05
min      2.800000e+04
25%      2.160000e+05
50%      3.000000e+05
75%      3.999990e+05
max      1.180000e+06
Name: resale_price, dtype: float64

In [12]:
def disply_dtype_plot(df = None):
    if df is None:
        return
    l = []
    cols = df.columns
    for i in cols:
        if df[i].dtype == 'int64':
            l.append('integer dtype')
        elif df[i].dtype == 'object':
            l.append('object dtype')
        elif df[i].dtype == 'float64':
            l.append('float dtype')
        else:
            pass
    sns.countplot(l)
    del l
disply_dtype_plot(df_raw)


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)

In [9]:
df_raw.head(0)


Out[9]:
month town flat_type block street_name storey_range floor_area_sqm flat_model lease_commence_date resale_price year age no_resold

In [8]:
df_raw.drop(['Unnamed: 0','temp','building_id','no_times_resold'], inplace=True, axis=1)

In [16]:
df_raw["Age"] = df_raw['year'] - df_raw['lease_commence_date']

Re


In [58]:
df_raw.tail(n=10)


Out[58]:
year month flat_type street_name storey_range floor_area_sqm flat_model lease_commence_date resale_price Age
477823 2017 11 5 YISHUN ST 21 10 TO 12 121.0 Improved 1985 480000.0 32
477824 2017 11 5 YISHUN ST 81 04 TO 06 122.0 Improved 1988 490000.0 29
477825 2017 11 5 YISHUN ST 81 07 TO 09 122.0 Improved 1988 548000.0 29
477826 2017 11 EXECUTIVE YISHUN AVE 6 01 TO 03 142.0 Apartment 1988 465000.0 29
477827 2017 11 EXECUTIVE YISHUN ST 61 01 TO 03 142.0 Apartment 1987 515000.0 30
477828 2017 11 EXECUTIVE YISHUN ST 61 10 TO 12 164.0 Apartment 1992 728000.0 25
477829 2017 11 EXECUTIVE YISHUN ST 81 01 TO 03 154.0 Maisonette 1987 648888.0 30
477830 2017 11 EXECUTIVE YISHUN ST 81 07 TO 09 142.0 Apartment 1988 658000.0 29
477831 2017 11 EXECUTIVE YISHUN ST 81 04 TO 06 142.0 Apartment 1987 598000.0 30
477832 2017 11 MULTI-GENERATION YISHUN AVE 4 01 TO 03 179.0 Multi Generation 1987 695000.0 30

In [15]:
month_non_split = list(df_raw['month'])

In [19]:
s = re.split('-','2000-01');print(s[0]);print(s[1])


2000
01

In [20]:
year = []
mon = []
for i in month_non_split:
    year.append(re.split('-', i)[0])
    mon.append(re.split('-', i)[1])

In [24]:
df_raw.drop(['month'],axis =1,inplace=True)

In [25]:
df_raw['month'] = mon
df_raw['year'] = year

In [26]:
df_raw.columns


Out[26]:
Index(['town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'lease_commence_date', 'resale_price',
       'month', 'year'],
      dtype='object')

In [27]:
df_raw.head()


Out[27]:
town flat_type block street_name storey_range floor_area_sqm flat_model lease_commence_date resale_price month year
0 ANG MO KIO 3 ROOM 216 ANG MO KIO AVE 1 07 TO 09 73.0 New Generation 1976 159000.0 01 2000
1 ANG MO KIO 3 ROOM 215 ANG MO KIO AVE 1 07 TO 09 73.0 New Generation 1976 167000.0 01 2000
2 ANG MO KIO 3 ROOM 218 ANG MO KIO AVE 1 07 TO 09 67.0 New Generation 1976 163000.0 01 2000
3 ANG MO KIO 4 ROOM 304 ANG MO KIO AVE 1 04 TO 06 97.0 New Generation 1977 260000.0 01 2000
4 ANG MO KIO 5 ROOM 221 ANG MO KIO AVE 1 10 TO 12 127.0 Improved 1993 408000.0 01 2000

Plots


In [27]:
var = 'Age'
plt.figure(figsize=(10,10))
plt.xticks(rotation = 90)
sns.countplot(df_raw[var])


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x27a1f776cc0>

In [58]:
var = 'month'
plt.xticks(rotation = 45)
sns.countplot(df_raw.month)


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[58]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a7401bd588>

In [59]:
plt.xticks(rotation = 45)
sns.countplot(df_raw.year)


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[59]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a73faff780>

In [33]:
var = 'flat_type'
plt.xticks(rotation = 45)
sns.countplot(df_raw.flat_type)


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[33]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a735485630>

In [43]:
from collections import Counter
Counter(df_raw['street_name'])


Out[43]:
Counter({'ADMIRALTY DR': 1763,
         'ADMIRALTY LINK': 1062,
         'AH HOOD RD': 75,
         'ALJUNIED CRES': 1164,
         'ALJUNIED RD': 70,
         'ANCHORVALE DR': 699,
         'ANCHORVALE LANE': 298,
         'ANCHORVALE LINK': 1253,
         'ANCHORVALE RD': 1085,
         'ANG MO KIO AVE 1': 2085,
         'ANG MO KIO AVE 10': 6656,
         'ANG MO KIO AVE 2': 107,
         'ANG MO KIO AVE 3': 5830,
         'ANG MO KIO AVE 4': 3651,
         'ANG MO KIO AVE 5': 3108,
         'ANG MO KIO AVE 6': 1236,
         'ANG MO KIO AVE 8': 880,
         'ANG MO KIO AVE 9': 395,
         'ANG MO KIO ST 11': 77,
         'ANG MO KIO ST 21': 90,
         'ANG MO KIO ST 31': 235,
         'ANG MO KIO ST 32': 369,
         'ANG MO KIO ST 52': 299,
         'BAIN ST': 90,
         'BALAM RD': 1049,
         'BANGKIT RD': 1003,
         'BEACH RD': 330,
         'BEDOK CTRL': 168,
         'BEDOK NTH AVE 1': 1123,
         'BEDOK NTH AVE 2': 1099,
         'BEDOK NTH AVE 3': 1010,
         'BEDOK NTH AVE 4': 1197,
         'BEDOK NTH RD': 3312,
         'BEDOK NTH ST 1': 800,
         'BEDOK NTH ST 2': 1322,
         'BEDOK NTH ST 3': 3659,
         'BEDOK NTH ST 4': 271,
         'BEDOK RESERVOIR RD': 6326,
         'BEDOK RESERVOIR VIEW': 868,
         'BEDOK STH AVE 1': 637,
         'BEDOK STH AVE 2': 996,
         'BEDOK STH AVE 3': 648,
         'BEDOK STH RD': 1254,
         'BENDEMEER RD': 860,
         'BEO CRES': 236,
         'BISHAN ST 11': 367,
         'BISHAN ST 12': 1923,
         'BISHAN ST 13': 2095,
         'BISHAN ST 22': 1398,
         'BISHAN ST 23': 1053,
         'BISHAN ST 24': 1017,
         'BOON KENG RD': 350,
         'BOON LAY AVE': 1250,
         'BOON LAY DR': 1267,
         'BOON LAY PL': 1876,
         'BOON TIONG RD': 262,
         'BRIGHT HILL DR': 142,
         'BT BATOK CTRL': 1297,
         'BT BATOK EAST AVE 3': 817,
         'BT BATOK EAST AVE 4': 1234,
         'BT BATOK EAST AVE 5': 1760,
         'BT BATOK ST 11': 660,
         'BT BATOK ST 21': 1556,
         'BT BATOK ST 22': 168,
         'BT BATOK ST 24': 882,
         'BT BATOK ST 25': 1030,
         'BT BATOK ST 31': 1572,
         'BT BATOK ST 32': 815,
         'BT BATOK ST 33': 832,
         'BT BATOK ST 34': 1049,
         'BT BATOK ST 51': 506,
         'BT BATOK ST 52': 1553,
         'BT BATOK WEST AVE 2': 516,
         'BT BATOK WEST AVE 4': 839,
         'BT BATOK WEST AVE 5': 871,
         'BT BATOK WEST AVE 6': 2483,
         'BT BATOK WEST AVE 7': 159,
         'BT BATOK WEST AVE 8': 1209,
         'BT MERAH CTRL': 155,
         'BT MERAH LANE 1': 25,
         'BT MERAH VIEW': 981,
         'BT PANJANG RING RD': 2124,
         'BT PURMEI RD': 970,
         'BUANGKOK CRES': 604,
         'BUANGKOK LINK': 199,
         'BUFFALO RD': 146,
         "C'WEALTH AVE": 187,
         "C'WEALTH AVE WEST": 357,
         "C'WEALTH CL": 951,
         "C'WEALTH CRES": 750,
         "C'WEALTH DR": 1619,
         'CAMBRIDGE RD': 260,
         'CANBERRA LINK': 160,
         'CANBERRA RD': 1510,
         'CANTONMENT CL': 462,
         'CANTONMENT RD': 255,
         'CASHEW RD': 293,
         'CASSIA CRES': 491,
         'CHAI CHEE AVE': 580,
         'CHAI CHEE DR': 337,
         'CHAI CHEE RD': 971,
         'CHAI CHEE ST': 1021,
         'CHANDER RD': 57,
         'CHANGI VILLAGE RD': 139,
         'CHIN SWEE RD': 48,
         'CHOA CHU KANG AVE 1': 202,
         'CHOA CHU KANG AVE 2': 1135,
         'CHOA CHU KANG AVE 3': 1840,
         'CHOA CHU KANG AVE 4': 2603,
         'CHOA CHU KANG AVE 5': 829,
         'CHOA CHU KANG CRES': 3064,
         'CHOA CHU KANG CTRL': 1577,
         'CHOA CHU KANG DR': 646,
         'CHOA CHU KANG LOOP': 198,
         'CHOA CHU KANG NTH 5': 631,
         'CHOA CHU KANG NTH 6': 1088,
         'CHOA CHU KANG NTH 7': 170,
         'CHOA CHU KANG ST 51': 963,
         'CHOA CHU KANG ST 52': 827,
         'CHOA CHU KANG ST 53': 397,
         'CHOA CHU KANG ST 54': 429,
         'CHOA CHU KANG ST 62': 1316,
         'CHOA CHU KANG ST 64': 504,
         'CIRCUIT RD': 2891,
         'CLARENCE LANE': 212,
         'CLEMENTI AVE 1': 406,
         'CLEMENTI AVE 2': 1780,
         'CLEMENTI AVE 3': 1297,
         'CLEMENTI AVE 4': 2022,
         'CLEMENTI AVE 5': 1475,
         'CLEMENTI AVE 6': 249,
         'CLEMENTI ST 11': 31,
         'CLEMENTI ST 12': 33,
         'CLEMENTI ST 13': 421,
         'CLEMENTI ST 14': 117,
         'CLEMENTI WEST ST 1': 1060,
         'CLEMENTI WEST ST 2': 1797,
         'COMPASSVALE BOW': 190,
         'COMPASSVALE CRES': 473,
         'COMPASSVALE DR': 617,
         'COMPASSVALE LANE': 675,
         'COMPASSVALE LINK': 786,
         'COMPASSVALE RD': 1090,
         'COMPASSVALE ST': 1377,
         'COMPASSVALE WALK': 1312,
         'CORPORATION DR': 390,
         'CRAWFORD LANE': 228,
         'DAKOTA CRES': 180,
         'DELTA AVE': 219,
         'DEPOT RD': 545,
         'DORSET RD': 189,
         'DOVER CL EAST': 154,
         'DOVER CRES': 780,
         'DOVER RD': 351,
         'EAST COAST RD': 45,
         'EDGEDALE PLAINS': 1402,
         'EDGEFIELD PLAINS': 1522,
         'ELIAS RD': 578,
         'EMPRESS RD': 120,
         'EUNOS CRES': 1462,
         'EUNOS RD 5': 245,
         'EVERTON PK': 250,
         'FAJAR RD': 1177,
         'FARRER PK RD': 276,
         'FARRER RD': 92,
         'FERNVALE LANE': 387,
         'FERNVALE LINK': 191,
         'FERNVALE RD': 1132,
         'FRENCH RD': 59,
         'GANGSA RD': 1345,
         'GEYLANG BAHRU': 1504,
         'GEYLANG EAST AVE 1': 459,
         'GEYLANG EAST AVE 2': 5,
         'GEYLANG EAST CTRL': 116,
         'GEYLANG SERAI': 110,
         'GHIM MOH LINK': 73,
         'GHIM MOH RD': 1387,
         'GLOUCESTER RD': 149,
         'HAIG RD': 801,
         'HAVELOCK RD': 343,
         'HENDERSON CRES': 417,
         'HENDERSON RD': 78,
         'HO CHING RD': 695,
         'HOLLAND AVE': 937,
         'HOLLAND CL': 965,
         'HOLLAND DR': 840,
         'HOUGANG AVE 1': 2157,
         'HOUGANG AVE 10': 1946,
         'HOUGANG AVE 2': 757,
         'HOUGANG AVE 3': 1666,
         'HOUGANG AVE 4': 1123,
         'HOUGANG AVE 5': 1831,
         'HOUGANG AVE 6': 1184,
         'HOUGANG AVE 7': 746,
         'HOUGANG AVE 8': 4667,
         'HOUGANG AVE 9': 1066,
         'HOUGANG CTRL': 685,
         'HOUGANG ST 11': 448,
         'HOUGANG ST 21': 355,
         'HOUGANG ST 22': 522,
         'HOUGANG ST 31': 213,
         'HOUGANG ST 51': 1653,
         'HOUGANG ST 52': 532,
         'HOUGANG ST 61': 825,
         'HOUGANG ST 91': 1896,
         'HOUGANG ST 92': 571,
         'HOY FATT RD': 213,
         'INDUS RD': 149,
         'JELAPANG RD': 2669,
         'JELEBU RD': 343,
         'JELLICOE RD': 116,
         'JLN BAHAGIA': 799,
         'JLN BATU': 662,
         'JLN BERSEH': 29,
         'JLN BT HO SWEE': 318,
         'JLN BT MERAH': 1820,
         'JLN DAMAI': 508,
         'JLN DUA': 112,
         'JLN DUSUN': 62,
         'JLN KAYU': 26,
         'JLN KLINIK': 290,
         'JLN KUKOH': 190,
         "JLN MA'MOR": 99,
         'JLN MEMBINA': 668,
         'JLN PASAR BARU': 36,
         'JLN RAJAH': 372,
         'JLN RUMAH TINGGI': 598,
         'JLN TECK WHYE': 996,
         'JLN TENAGA': 889,
         'JLN TENTERAM': 226,
         'JLN TIGA': 94,
         'JOO CHIAT RD': 112,
         'JOO SENG RD': 446,
         'JURONG EAST AVE 1': 689,
         'JURONG EAST ST 13': 991,
         'JURONG EAST ST 21': 2717,
         'JURONG EAST ST 24': 1589,
         'JURONG EAST ST 31': 589,
         'JURONG EAST ST 32': 1346,
         'JURONG WEST AVE 1': 1653,
         'JURONG WEST AVE 3': 326,
         'JURONG WEST AVE 5': 521,
         'JURONG WEST CTRL 1': 1667,
         'JURONG WEST ST 24': 415,
         'JURONG WEST ST 25': 401,
         'JURONG WEST ST 41': 2015,
         'JURONG WEST ST 42': 2757,
         'JURONG WEST ST 51': 45,
         'JURONG WEST ST 52': 2380,
         'JURONG WEST ST 61': 2458,
         'JURONG WEST ST 62': 199,
         'JURONG WEST ST 64': 1983,
         'JURONG WEST ST 65': 2923,
         'JURONG WEST ST 71': 788,
         'JURONG WEST ST 72': 117,
         'JURONG WEST ST 73': 672,
         'JURONG WEST ST 74': 714,
         'JURONG WEST ST 75': 206,
         'JURONG WEST ST 81': 2836,
         'JURONG WEST ST 91': 1876,
         'JURONG WEST ST 92': 688,
         'JURONG WEST ST 93': 644,
         'KALLANG BAHRU': 534,
         'KANG CHING RD': 882,
         'KELANTAN RD': 154,
         'KENT RD': 127,
         'KG ARANG RD': 106,
         'KG BAHRU HILL': 18,
         'KG KAYU RD': 52,
         'KIM CHENG ST': 49,
         'KIM KEAT AVE': 636,
         'KIM KEAT LINK': 300,
         'KIM PONG RD': 23,
         'KIM TIAN PL': 254,
         'KIM TIAN RD': 658,
         "KING GEORGE'S AVE": 145,
         'KLANG LANE': 141,
         'KRETA AYER RD': 39,
         'LENGKOK BAHRU': 457,
         'LENGKONG TIGA': 409,
         'LIM LIAK ST': 80,
         'LOMPANG RD': 582,
         'LOR 1 TOA PAYOH': 2657,
         'LOR 1A TOA PAYOH': 86,
         'LOR 2 TOA PAYOH': 1463,
         'LOR 3 GEYLANG': 78,
         'LOR 3 TOA PAYOH': 228,
         'LOR 4 TOA PAYOH': 1321,
         'LOR 5 TOA PAYOH': 1220,
         'LOR 6 TOA PAYOH': 261,
         'LOR 7 TOA PAYOH': 1534,
         'LOR 8 TOA PAYOH': 1566,
         'LOR AH SOO': 882,
         'LOR LEW LIAN': 534,
         'LOR LIMAU': 271,
         'LOWER DELTA RD': 47,
         'MACPHERSON LANE': 224,
         'MARGARET DR': 13,
         'MARINE CRES': 1158,
         'MARINE DR': 1160,
         'MARINE PARADE CTRL': 2,
         'MARINE TER': 1383,
         'MARSILING CRES': 324,
         'MARSILING DR': 3336,
         'MARSILING LANE': 828,
         'MARSILING RD': 296,
         'MARSILING RISE': 790,
         'MCNAIR RD': 680,
         'MEI LING ST': 956,
         'MOH GUAN TER': 74,
         'MONTREAL DR': 469,
         'MOULMEIN RD': 38,
         'NEW MKT RD': 56,
         'NEW UPP CHANGI RD': 1486,
         'NTH BRIDGE RD': 157,
         'OLD AIRPORT RD': 119,
         'OWEN RD': 73,
         'PANDAN GDNS': 880,
         'PASIR RIS DR 1': 635,
         'PASIR RIS DR 10': 877,
         'PASIR RIS DR 3': 456,
         'PASIR RIS DR 4': 1040,
         'PASIR RIS DR 6': 1877,
         'PASIR RIS ST 11': 1934,
         'PASIR RIS ST 12': 911,
         'PASIR RIS ST 13': 406,
         'PASIR RIS ST 21': 2068,
         'PASIR RIS ST 41': 110,
         'PASIR RIS ST 51': 1124,
         'PASIR RIS ST 52': 879,
         'PASIR RIS ST 53': 700,
         'PASIR RIS ST 71': 3107,
         'PASIR RIS ST 72': 671,
         'PAYA LEBAR WAY': 498,
         'PENDING RD': 1478,
         'PETIR RD': 1734,
         'PINE CL': 381,
         'PIPIT RD': 312,
         'POTONG PASIR AVE 1': 1328,
         'POTONG PASIR AVE 2': 168,
         'POTONG PASIR AVE 3': 343,
         'PUNGGOL CTRL': 1648,
         'PUNGGOL DR': 959,
         'PUNGGOL EAST': 110,
         'PUNGGOL FIELD': 1971,
         'PUNGGOL FIELD WALK': 198,
         'PUNGGOL PL': 303,
         'PUNGGOL RD': 256,
         'PUNGGOL WALK': 21,
         'QUEEN ST': 139,
         "QUEEN'S CL": 379,
         "QUEEN'S RD": 240,
         'QUEENSWAY': 119,
         'RACE COURSE RD': 192,
         'REDHILL CL': 748,
         'REDHILL LANE': 159,
         'REDHILL RD': 282,
         'RIVERVALE CRES': 1463,
         'RIVERVALE DR': 2495,
         'RIVERVALE ST': 854,
         'RIVERVALE WALK': 808,
         'ROCHOR RD': 168,
         'ROWELL RD': 189,
         'SAGO LANE': 118,
         'SAUJANA RD': 234,
         'SEGAR RD': 1113,
         'SELEGIE RD': 253,
         'SELETAR WEST FARMWAY 6': 8,
         'SEMBAWANG CL': 1340,
         'SEMBAWANG CRES': 444,
         'SEMBAWANG DR': 1573,
         'SEMBAWANG VISTA': 328,
         'SEMBAWANG WAY': 10,
         'SENG POH RD': 14,
         'SENGKANG CTRL': 235,
         'SENGKANG EAST RD': 228,
         'SENGKANG EAST WAY': 964,
         'SENGKANG WEST AVE': 113,
         'SENJA LINK': 168,
         'SENJA RD': 1788,
         'SERANGOON AVE 1': 350,
         'SERANGOON AVE 2': 1285,
         'SERANGOON AVE 3': 1249,
         'SERANGOON AVE 4': 1393,
         'SERANGOON CTRL': 1244,
         'SERANGOON CTRL DR': 340,
         'SERANGOON NTH AVE 1': 1953,
         'SERANGOON NTH AVE 2': 267,
         'SERANGOON NTH AVE 3': 860,
         'SERANGOON NTH AVE 4': 1923,
         'SHUNFU RD': 658,
         'SILAT AVE': 91,
         'SIMEI LANE': 68,
         'SIMEI RD': 464,
         'SIMEI ST 1': 2582,
         'SIMEI ST 2': 260,
         'SIMEI ST 4': 634,
         'SIMEI ST 5': 403,
         'SIMS AVE': 92,
         'SIMS DR': 693,
         'SIMS PL': 170,
         'SIN MING AVE': 1018,
         'SIN MING RD': 389,
         'SMITH ST': 116,
         'SPOTTISWOODE PK RD': 180,
         "ST. GEORGE'S LANE": 453,
         "ST. GEORGE'S RD": 843,
         'STIRLING RD': 1061,
         'STRATHMORE AVE': 835,
         'TAH CHING RD': 754,
         'TAMAN HO SWEE': 278,
         'TAMPINES AVE 4': 1095,
         'TAMPINES AVE 5': 763,
         'TAMPINES AVE 7': 241,
         'TAMPINES AVE 8': 552,
         'TAMPINES AVE 9': 650,
         'TAMPINES CTRL 1': 146,
         'TAMPINES CTRL 7': 165,
         'TAMPINES ST 11': 1361,
         'TAMPINES ST 12': 1259,
         'TAMPINES ST 21': 3554,
         'TAMPINES ST 22': 2050,
         'TAMPINES ST 23': 987,
         'TAMPINES ST 24': 583,
         'TAMPINES ST 32': 1095,
         'TAMPINES ST 33': 1760,
         'TAMPINES ST 34': 1234,
         'TAMPINES ST 41': 1970,
         'TAMPINES ST 42': 1310,
         'TAMPINES ST 43': 1024,
         'TAMPINES ST 44': 517,
         'TAMPINES ST 45': 1936,
         'TAMPINES ST 71': 2428,
         'TAMPINES ST 72': 1033,
         'TAMPINES ST 81': 1505,
         'TAMPINES ST 82': 1335,
         'TAMPINES ST 83': 1677,
         'TAMPINES ST 84': 675,
         'TAMPINES ST 91': 1050,
         'TANGLIN HALT RD': 1930,
         'TAO CHING RD': 107,
         'TEBAN GDNS RD': 1680,
         'TECK WHYE AVE': 669,
         'TECK WHYE LANE': 2046,
         'TELOK BLANGAH CRES': 1737,
         'TELOK BLANGAH DR': 1317,
         'TELOK BLANGAH HTS': 1069,
         'TELOK BLANGAH RISE': 576,
         'TELOK BLANGAH ST 31': 45,
         'TELOK BLANGAH WAY': 214,
         'TESSENSOHN RD': 42,
         'TG PAGAR PLAZA': 518,
         'TIONG BAHRU RD': 62,
         'TOA PAYOH CTRL': 437,
         'TOA PAYOH EAST': 431,
         'TOA PAYOH NTH': 983,
         'TOH GUAN RD': 1957,
         'TOH YI DR': 724,
         'TOWNER RD': 388,
         'UBI AVE 1': 1901,
         'UPP ALJUNIED LANE': 240,
         'UPP BOON KENG RD': 1001,
         'UPP CROSS ST': 310,
         'UPP SERANGOON RD': 80,
         'VEERASAMY RD': 201,
         'WATERLOO ST': 150,
         'WELLINGTON CIRCLE': 710,
         'WEST COAST DR': 693,
         'WEST COAST RD': 1305,
         'WHAMPOA DR': 1283,
         'WHAMPOA RD': 320,
         'WHAMPOA STH': 46,
         'WHAMPOA WEST': 221,
         'WOODLANDS AVE 1': 1435,
         'WOODLANDS AVE 3': 237,
         'WOODLANDS AVE 4': 1150,
         'WOODLANDS AVE 5': 768,
         'WOODLANDS AVE 6': 2257,
         'WOODLANDS AVE 9': 271,
         'WOODLANDS CIRCLE': 3079,
         'WOODLANDS CRES': 1196,
         'WOODLANDS CTR RD': 108,
         'WOODLANDS DR 14': 2622,
         'WOODLANDS DR 16': 1865,
         'WOODLANDS DR 40': 704,
         'WOODLANDS DR 42': 594,
         'WOODLANDS DR 44': 467,
         'WOODLANDS DR 50': 2383,
         'WOODLANDS DR 52': 745,
         'WOODLANDS DR 53': 403,
         'WOODLANDS DR 60': 1130,
         'WOODLANDS DR 62': 620,
         'WOODLANDS DR 70': 1023,
         'WOODLANDS DR 71': 263,
         'WOODLANDS DR 72': 374,
         'WOODLANDS DR 73': 639,
         'WOODLANDS DR 75': 1310,
         'WOODLANDS RING RD': 3351,
         'WOODLANDS ST 11': 232,
         'WOODLANDS ST 13': 2200,
         'WOODLANDS ST 31': 873,
         'WOODLANDS ST 32': 656,
         'WOODLANDS ST 41': 1978,
         'WOODLANDS ST 81': 909,
         'WOODLANDS ST 82': 1572,
         'WOODLANDS ST 83': 1478,
         'YISHUN AVE 11': 1102,
         'YISHUN AVE 2': 541,
         'YISHUN AVE 3': 724,
         'YISHUN AVE 4': 1239,
         'YISHUN AVE 5': 1892,
         'YISHUN AVE 6': 2029,
         'YISHUN AVE 7': 441,
         'YISHUN AVE 9': 354,
         'YISHUN CTRL': 669,
         'YISHUN CTRL 1': 160,
         'YISHUN RING RD': 8171,
         'YISHUN ST 11': 2349,
         'YISHUN ST 20': 219,
         'YISHUN ST 21': 2305,
         'YISHUN ST 22': 1664,
         'YISHUN ST 41': 59,
         'YISHUN ST 61': 2042,
         'YISHUN ST 71': 1366,
         'YISHUN ST 72': 1643,
         'YISHUN ST 81': 2074,
         'YUAN CHING RD': 94,
         'YUNG AN RD': 300,
         'YUNG HO RD': 150,
         'YUNG KUANG RD': 111,
         'YUNG LOH RD': 265,
         'YUNG PING RD': 311,
         'YUNG SHENG RD': 284,
         'ZION RD': 141})

In [47]:
var = 'lease_commence_date'
data = pd.concat([df_raw['resale_price'], df_raw[var]], axis=1)
data.plot.scatter(x=var, y='resale_price')


Out[47]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a7401b9c50>

In [48]:
var = 'year'
data = pd.concat([df_raw['resale_price'], df_raw[var]], axis=1)
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x=var, y="resale_price", data=data)
plt.xticks(rotation=90);


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:462: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  box_data = remove_na(group_data)

In [42]:
var = 'street_name'
plt.figure(figsize=(15,15))
plt.xticks(rotation = 90)
sns.countplot(df_raw[var])


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a738309128>

In [38]:
var = 'town'
plt.figure(figsize=(8,8))
plt.xticks(rotation = 90)
sns.countplot(df_raw[var])


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[38]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a742e40278>

In [39]:
var = 'storey_range'
plt.xticks(rotation = 45)
sns.countplot(df_raw[var])


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a74227d710>

In [44]:
var = 'flat_model'
plt.figure(figsize=(8,8))
plt.xticks(rotation = 90)
sns.countplot(df_raw[var])


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a73cb9cef0>

Here only few types are palying the role as such.. To name them they are NewGen,Improved,ModelA,Standard,Apart,Simplified,Premium,Maisonette rest we can remove them

Unique values


In [64]:
train_cats(df_raw)
#it's a helper function to aotumate the boring stuffs..
# For Further Insights do a `shift+tab' or a `??train_cats`

In [65]:
df_raw.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 477833 entries, 0 to 477832
Data columns (total 11 columns):
town                   477833 non-null category
flat_type              477833 non-null category
block                  477833 non-null category
street_name            477833 non-null category
storey_range           477833 non-null category
floor_area_sqm         477833 non-null float64
flat_model             477833 non-null category
lease_commence_date    477833 non-null int64
resale_price           477833 non-null float64
month                  477833 non-null category
year                   477833 non-null category
dtypes: category(8), float64(2), int64(1)
memory usage: 15.6 MB

In [66]:
df_raw['month'].cat.codes


Out[66]:
0          0
1          0
2          0
3          0
4          0
5          0
6          0
7          0
8          0
9          0
10         0
11         0
12         0
13         0
14         0
15         0
16         0
17         0
18         0
19         0
20         0
21         0
22         0
23         0
24         0
25         0
26         0
27         0
28         0
29         0
          ..
477803    10
477804    10
477805    10
477806    10
477807    10
477808    10
477809    10
477810    10
477811    10
477812    10
477813    10
477814    10
477815    10
477816    10
477817    10
477818    10
477819    10
477820    10
477821    10
477822    10
477823    10
477824    10
477825    10
477826    10
477827    10
477828    10
477829    10
477830    10
477831    10
477832    10
Length: 477833, dtype: int8

In [68]:
#correlation matrix
corrmat = df_raw.corr()
sns.heatmap(corrmat, vmax=.8, square=True);


RF


In [10]:
df_raw.resale_price = np.log(df_raw.resale_price)

In [11]:
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)
    return(res)

In [14]:
train_cats(df_raw)

In [15]:
df, y, _ = proc_df(df_raw, 'resale_price')

In [16]:
df.columns


Out[16]:
Index(['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'lease_commence_date', 'year', 'age',
       'no_resold'],
      dtype='object')

In [17]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)


Out[17]:
0.99530658728560062

In [18]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

n_valid = 40000
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)
X_train.shape, y_train.shape, X_valid.shape


Out[18]:
((353287, 12), (353287,), (40000, 12))

In [19]:
m = RandomForestRegressor(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
m.score(df,y)


Out[19]:
0.77697789832166875

In [20]:
draw_tree(m.estimators_[0], X_train, precision=3)


Tree 0 floor_area_sqm ≤ 84.5 mse = 0.164 samples = 353287 value = 12.487 1 month ≤ 100.5 mse = 0.121 samples = 131719 value = 12.172 0->1 True 8 month ≤ 101.5 mse = 0.096 samples = 221568 value = 12.675 0->8 False 2 floor_area_sqm ≤ 74.5 mse = 0.052 samples = 91114 value = 11.995 1->2 5 month ≤ 126.5 mse = 0.049 samples = 40605 value = 12.569 1->5 3 mse = 0.045 samples = 72021 value = 11.945 2->3 4 mse = 0.035 samples = 19093 value = 12.18 2->4 6 mse = 0.034 samples = 20325 value = 12.433 5->6 7 mse = 0.026 samples = 20280 value = 12.706 5->7 9 flat_type ≤ 4.5 mse = 0.067 samples = 149709 value = 12.55 8->9 12 month ≤ 127.5 mse = 0.055 samples = 71859 value = 12.937 8->12 10 mse = 0.032 samples = 83299 value = 12.393 9->10 11 mse = 0.041 samples = 66410 value = 12.747 9->11 13 mse = 0.044 samples = 40258 value = 12.835 12->13 14 mse = 0.039 samples = 31601 value = 13.066 12->14

In [21]:
fi = rf_feat_importance(m, df); fi[:10]


Out[21]:
cols imp
6 floor_area_sqm 0.487445
0 month 0.408782
2 flat_type 0.103774
1 town 0.000000
3 block 0.000000
4 street_name 0.000000
5 storey_range 0.000000
7 flat_model 0.000000
8 lease_commence_date 0.000000
9 year 0.000000

In [22]:
fi.plot('cols', 'imp', figsize=(10,6), legend=False);



In [23]:
def plot_fi(fi): 
    return fi.plot('cols', 'imp', 'barh', figsize=(12,7), legend=False)

In [24]:
plot_fi(fi[:10]);



In [25]:
from scipy.cluster import hierarchy as hc

In [26]:
corr = np.round(scipy.stats.spearmanr(df).correlation, 4)
corr_condensed = hc.distance.squareform(1-corr)
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(10,10))
dendrogram = hc.dendrogram(z, labels=df.columns, orientation='left', leaf_font_size=16)
plt.show()


Partial Dependence


In [28]:
from pdpbox import pdp
from plotnine import *

In [29]:
set_rf_samples(10000)

In [30]:
df_trn2, y_trn, _ = proc_df(df_raw, 'resale_price', max_n_cat=20)
X_train, X_valid = split_vals(df_trn2, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.6, n_jobs=-1)
m.fit(X_train, y_train);

In [31]:
plot_fi(rf_feat_importance(m, df_trn2)[:20]);



In [32]:
x = get_sample(X_train, 500)

In [43]:
def plot_pdp(feat, clusters=None, feat_name=None):
    feat_name = feat_name or feat
    p = pdp.pdp_isolate(m, x, feat)
    return pdp.pdp_plot(p, feat_name, plot_lines=True, 
                        cluster=clusters is not None, n_cluster_centers=clusters)

In [48]:
df_raw.resale_price = np.exp(df_raw.resale_price)

In [49]:
x_all = get_sample(df_raw, 500)

In [50]:
ggplot(x_all, aes('year', 'resale_price'))+stat_smooth(se=True, method='loess')


Out[50]:
<ggplot: (91195899298)>

In [51]:
plot_pdp('year')



In [53]:
plot_pdp('year', clusters=5)



In [52]:
plot_pdp('floor_area_sqm', clusters=5)



In [120]:
feats = ['year','lease_commence_date']
p = pdp.pdp_interact(m, x, feats)
pdp.pdp_interact_plot(p, feats)


Tree Interpreter


In [121]:
from treeinterpreter import treeinterpreter as ti

In [124]:
fi = rf_feat_importance(m, x); fi[:15]


Out[124]:
cols imp
8 year 0.427701
4 floor_area_sqm 0.289395
11 flat_type_3 ROOM 0.067152
12 flat_type_4 ROOM 0.041010
6 lease_commence_date 0.037875
0 town 0.037752
2 street_name 0.022137
1 block 0.020883
14 flat_type_EXECUTIVE 0.014993
5 flat_model 0.011436
3 storey_range 0.009497
13 flat_type_5 ROOM 0.009240
7 month 0.005975
10 flat_type_2 ROOM 0.003909
9 flat_type_1 ROOM 0.001027

In [125]:
feats=['flat_type_3 ROOM','flat_type_4 ROOM']

In [126]:
(X_train[feats]/1000).describe()


Out[126]:
flat_type_3 ROOM flat_type_4 ROOM
count 437833.000000 437833.000000
mean 0.000307 0.000385
std 0.000461 0.000487
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 0.001000 0.001000
max 0.001000 0.001000

In [127]:
(X_valid[feats]/1000).describe()


Out[127]:
flat_type_3 ROOM flat_type_4 ROOM
count 40000.000000 40000.000000
mean 0.000255 0.000415
std 0.000436 0.000493
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 0.000000
75% 0.001000 0.001000
max 0.001000 0.001000

In [128]:
x.drop(feats, axis=1, inplace=True)

29th Jan


In [37]:
from collections import Counter
z = Counter(df_raw['street_name'])

In [42]:
for k, v in z.items():
    print("%s: %s" % (k, v))


ANG MO KIO AVE 1: 2085
ANG MO KIO AVE 10: 6656
ANG MO KIO AVE 2: 107
ANG MO KIO AVE 3: 5830
ANG MO KIO AVE 4: 3651
ANG MO KIO AVE 5: 3108
ANG MO KIO AVE 6: 1236
ANG MO KIO AVE 8: 880
ANG MO KIO AVE 9: 395
ANG MO KIO ST 11: 77
ANG MO KIO ST 21: 90
ANG MO KIO ST 31: 235
BEDOK NTH AVE 1: 1123
BEDOK NTH AVE 2: 1099
BEDOK NTH AVE 3: 1010
BEDOK NTH AVE 4: 1197
BEDOK NTH RD: 3312
BEDOK NTH ST 1: 800
BEDOK NTH ST 2: 1322
BEDOK NTH ST 3: 3659
BEDOK NTH ST 4: 271
BEDOK RESERVOIR RD: 6326
BEDOK STH AVE 1: 637
BEDOK STH AVE 2: 996
BEDOK STH AVE 3: 648
BEDOK STH RD: 1254
CHAI CHEE AVE: 580
CHAI CHEE DR: 337
CHAI CHEE RD: 971
CHAI CHEE ST: 1021
JLN TENAGA: 889
LENGKONG TIGA: 409
NEW UPP CHANGI RD: 1486
BISHAN ST 11: 367
BISHAN ST 12: 1923
BISHAN ST 13: 2095
BISHAN ST 22: 1398
BISHAN ST 23: 1053
BISHAN ST 24: 1017
BRIGHT HILL DR: 142
SHUNFU RD: 658
SIN MING AVE: 1018
SIN MING RD: 389
BT BATOK CTRL: 1297
BT BATOK EAST AVE 3: 817
BT BATOK EAST AVE 4: 1234
BT BATOK EAST AVE 5: 1760
BT BATOK ST 11: 660
BT BATOK ST 21: 1556
BT BATOK ST 31: 1572
BT BATOK ST 32: 815
BT BATOK ST 33: 832
BT BATOK ST 34: 1049
BT BATOK ST 51: 506
BT BATOK ST 52: 1553
BT BATOK WEST AVE 2: 516
BT BATOK WEST AVE 4: 839
BT BATOK WEST AVE 6: 2483
BT BATOK WEST AVE 8: 1209
BEO CRES: 236
BT MERAH CTRL: 155
BT MERAH VIEW: 981
BT PURMEI RD: 970
DELTA AVE: 219
DEPOT RD: 545
EVERTON PK: 250
HAVELOCK RD: 343
HENDERSON CRES: 417
HOY FATT RD: 213
JLN BT HO SWEE: 318
JLN BT MERAH: 1820
JLN KLINIK: 290
KIM TIAN PL: 254
KIM TIAN RD: 658
REDHILL CL: 748
SPOTTISWOODE PK RD: 180
TAMAN HO SWEE: 278
TELOK BLANGAH CRES: 1737
TELOK BLANGAH DR: 1317
TELOK BLANGAH HTS: 1069
TELOK BLANGAH RISE: 576
TELOK BLANGAH WAY: 214
ZION RD: 141
BANGKIT RD: 1003
BT PANJANG RING RD: 2124
FAJAR RD: 1177
GANGSA RD: 1345
PENDING RD: 1478
PETIR RD: 1734
SAUJANA RD: 234
EMPRESS RD: 120
QUEEN'S RD: 240
TOH YI DR: 724
CHANDER RD: 57
JLN KUKOH: 190
KELANTAN RD: 154
KRETA AYER RD: 39
QUEEN ST: 139
ROCHOR RD: 168
ROWELL RD: 189
SAGO LANE: 118
TG PAGAR PLAZA: 518
VEERASAMY RD: 201
CHOA CHU KANG AVE 1: 202
CHOA CHU KANG AVE 2: 1135
CHOA CHU KANG AVE 3: 1840
CHOA CHU KANG AVE 4: 2603
CHOA CHU KANG CTRL: 1577
CHOA CHU KANG NTH 5: 631
CHOA CHU KANG NTH 6: 1088
CHOA CHU KANG NTH 7: 170
CHOA CHU KANG ST 51: 963
CHOA CHU KANG ST 52: 827
CHOA CHU KANG ST 53: 397
CHOA CHU KANG ST 62: 1316
TECK WHYE AVE: 669
TECK WHYE LANE: 2046
CLEMENTI AVE 1: 406
CLEMENTI AVE 2: 1780
CLEMENTI AVE 3: 1297
CLEMENTI AVE 4: 2022
CLEMENTI AVE 5: 1475
CLEMENTI AVE 6: 249
CLEMENTI ST 13: 421
CLEMENTI WEST ST 1: 1060
CLEMENTI WEST ST 2: 1797
WEST COAST DR: 693
WEST COAST RD: 1305
ALJUNIED CRES: 1164
ALJUNIED RD: 70
BALAM RD: 1049
CIRCUIT RD: 2891
DAKOTA CRES: 180
EUNOS CRES: 1462
EUNOS RD 5: 245
GEYLANG EAST AVE 1: 459
GEYLANG EAST CTRL: 116
GEYLANG SERAI: 110
HAIG RD: 801
JLN PASAR BARU: 36
MACPHERSON LANE: 224
PAYA LEBAR WAY: 498
PINE CL: 381
SIMS DR: 693
SIMS PL: 170
UBI AVE 1: 1901
HOUGANG AVE 1: 2157
HOUGANG AVE 10: 1946
HOUGANG AVE 2: 757
HOUGANG AVE 3: 1666
HOUGANG AVE 4: 1123
HOUGANG AVE 5: 1831
HOUGANG AVE 6: 1184
HOUGANG AVE 7: 746
HOUGANG AVE 8: 4667
HOUGANG CTRL: 685
HOUGANG ST 11: 448
HOUGANG ST 21: 355
HOUGANG ST 22: 522
HOUGANG ST 51: 1653
HOUGANG ST 52: 532
HOUGANG ST 61: 825
LOR AH SOO: 882
JURONG EAST AVE 1: 689
JURONG EAST ST 13: 991
JURONG EAST ST 21: 2717
JURONG EAST ST 24: 1589
JURONG EAST ST 31: 589
JURONG EAST ST 32: 1346
PANDAN GDNS: 880
TEBAN GDNS RD: 1680
BOON LAY AVE: 1250
BOON LAY DR: 1267
BOON LAY PL: 1876
HO CHING RD: 695
JURONG WEST AVE 1: 1653
JURONG WEST AVE 5: 521
JURONG WEST ST 41: 2015
JURONG WEST ST 42: 2757
JURONG WEST ST 52: 2380
JURONG WEST ST 71: 788
JURONG WEST ST 72: 117
JURONG WEST ST 73: 672
JURONG WEST ST 74: 714
JURONG WEST ST 75: 206
JURONG WEST ST 81: 2836
JURONG WEST ST 91: 1876
JURONG WEST ST 92: 688
YUNG AN RD: 300
YUNG KUANG RD: 111
AH HOOD RD: 75
BEACH RD: 330
BENDEMEER RD: 860
CRAWFORD LANE: 228
GEYLANG BAHRU: 1504
GLOUCESTER RD: 149
JLN BAHAGIA: 799
JLN BATU: 662
JLN RAJAH: 372
KALLANG BAHRU: 534
KENT RD: 127
KING GEORGE'S AVE: 145
LOR LIMAU: 271
LOR 3 GEYLANG: 78
MCNAIR RD: 680
RACE COURSE RD: 192
ST. GEORGE'S LANE: 453
ST. GEORGE'S RD: 843
TOWNER RD: 388
UPP BOON KENG RD: 1001
WHAMPOA DR: 1283
WHAMPOA RD: 320
WHAMPOA WEST: 221
MARINE CRES: 1158
MARINE DR: 1160
MARINE TER: 1383
ELIAS RD: 578
PASIR RIS DR 1: 635
PASIR RIS DR 10: 877
PASIR RIS DR 3: 456
PASIR RIS DR 4: 1040
PASIR RIS DR 6: 1877
PASIR RIS ST 11: 1934
PASIR RIS ST 12: 911
PASIR RIS ST 13: 406
PASIR RIS ST 21: 2068
PASIR RIS ST 41: 110
PASIR RIS ST 51: 1124
PASIR RIS ST 52: 879
PASIR RIS ST 53: 700
PASIR RIS ST 71: 3107
PASIR RIS ST 72: 671
C'WEALTH CL: 951
C'WEALTH CRES: 750
C'WEALTH DR: 1619
CLARENCE LANE: 212
DOVER CL EAST: 154
DOVER CRES: 780
DOVER RD: 351
GHIM MOH RD: 1387
HOLLAND AVE: 937
HOLLAND CL: 965
HOLLAND DR: 840
MEI LING ST: 956
STIRLING RD: 1061
STRATHMORE AVE: 835
TANGLIN HALT RD: 1930
SELETAR WEST FARMWAY 6: 8
LOR LEW LIAN: 534
SERANGOON AVE 1: 350
SERANGOON AVE 2: 1285
SERANGOON AVE 3: 1249
SERANGOON AVE 4: 1393
SERANGOON CTRL: 1244
SERANGOON NTH AVE 1: 1953
SERANGOON NTH AVE 2: 267
SERANGOON NTH AVE 4: 1923
SIMEI RD: 464
SIMEI ST 1: 2582
SIMEI ST 2: 260
SIMEI ST 4: 634
TAMPINES AVE 4: 1095
TAMPINES AVE 5: 763
TAMPINES AVE 7: 241
TAMPINES AVE 8: 552
TAMPINES AVE 9: 650
TAMPINES CTRL 1: 146
TAMPINES ST 11: 1361
TAMPINES ST 12: 1259
TAMPINES ST 21: 3554
TAMPINES ST 22: 2050
TAMPINES ST 23: 987
TAMPINES ST 24: 583
TAMPINES ST 32: 1095
TAMPINES ST 33: 1760
TAMPINES ST 34: 1234
TAMPINES ST 41: 1970
TAMPINES ST 42: 1310
TAMPINES ST 43: 1024
TAMPINES ST 44: 517
TAMPINES ST 45: 1936
TAMPINES ST 81: 1505
TAMPINES ST 82: 1335
TAMPINES ST 83: 1677
TAMPINES ST 84: 675
TAMPINES ST 91: 1050
JOO SENG RD: 446
KIM KEAT AVE: 636
KIM KEAT LINK: 300
LOR 1 TOA PAYOH: 2657
LOR 2 TOA PAYOH: 1463
LOR 3 TOA PAYOH: 228
LOR 4 TOA PAYOH: 1321
LOR 5 TOA PAYOH: 1220
LOR 6 TOA PAYOH: 261
LOR 7 TOA PAYOH: 1534
LOR 8 TOA PAYOH: 1566
POTONG PASIR AVE 1: 1328
POTONG PASIR AVE 2: 168
POTONG PASIR AVE 3: 343
TOA PAYOH CTRL: 437
TOA PAYOH EAST: 431
TOA PAYOH NTH: 983
UPP ALJUNIED LANE: 240
MARSILING DR: 3336
MARSILING LANE: 828
MARSILING RD: 296
MARSILING RISE: 790
WOODLANDS AVE 1: 1435
WOODLANDS AVE 4: 1150
WOODLANDS AVE 9: 271
WOODLANDS ST 11: 232
WOODLANDS ST 13: 2200
WOODLANDS ST 31: 873
WOODLANDS ST 32: 656
WOODLANDS ST 41: 1978
WOODLANDS ST 81: 909
WOODLANDS ST 82: 1572
WOODLANDS ST 83: 1478
YISHUN AVE 11: 1102
YISHUN AVE 2: 541
YISHUN AVE 3: 724
YISHUN AVE 4: 1239
YISHUN AVE 5: 1892
YISHUN AVE 6: 2029
YISHUN AVE 7: 441
YISHUN AVE 9: 354
YISHUN CTRL: 669
YISHUN CTRL 1: 160
YISHUN RING RD: 8171
YISHUN ST 11: 2349
YISHUN ST 21: 2305
YISHUN ST 22: 1664
YISHUN ST 61: 2042
YISHUN ST 71: 1366
YISHUN ST 72: 1643
YISHUN ST 81: 2074
JLN RUMAH TINGGI: 598
LENGKOK BAHRU: 457
SILAT AVE: 91
CASHEW RD: 293
BAIN ST: 90
NEW MKT RD: 56
SELEGIE RD: 253
CHOA CHU KANG LOOP: 198
C'WEALTH AVE WEST: 357
CASSIA CRES: 491
OLD AIRPORT RD: 119
UPP SERANGOON RD: 80
JURONG WEST ST 93: 644
YUNG HO RD: 150
YUNG PING RD: 311
CAMBRIDGE RD: 260
DORSET RD: 189
JLN MA'MOR: 99
OWEN RD: 73
WHAMPOA STH: 46
C'WEALTH AVE: 187
MARGARET DR: 13
QUEEN'S CL: 379
SERANGOON CTRL DR: 340
MARSILING CRES: 324
WOODLANDS AVE 3: 237
WOODLANDS CTR RD: 108
JLN DAMAI: 508
INDUS RD: 149
TIONG BAHRU RD: 62
FARRER RD: 92
BUFFALO RD: 146
UPP CROSS ST: 310
WATERLOO ST: 150
CLEMENTI ST 14: 117
JLN DUA: 112
JOO CHIAT RD: 112
PIPIT RD: 312
YUNG LOH RD: 265
BOON KENG RD: 350
NTH BRIDGE RD: 157
TESSENSOHN RD: 42
HENDERSON RD: 78
CHIN SWEE RD: 48
JLN BERSEH: 29
CHOA CHU KANG DR: 646
CHOA CHU KANG ST 54: 429
KG KAYU RD: 52
QUEENSWAY: 119
BT BATOK WEST AVE 7: 159
REDHILL LANE: 159
JURONG WEST ST 51: 45
FRENCH RD: 59
LIM LIAK ST: 80
SIMS AVE: 92
KANG CHING RD: 882
TAH CHING RD: 754
JLN DUSUN: 62
KG ARANG RD: 106
SIMEI ST 5: 403
WOODLANDS AVE 5: 768
WOODLANDS DR 50: 2383
EAST COAST RD: 45
KIM PONG RD: 23
TAO CHING RD: 107
MOULMEIN RD: 38
WOODLANDS DR 40: 704
BT MERAH LANE 1: 25
KG BAHRU HILL: 18
MOH GUAN TER: 74
SMITH ST: 116
WOODLANDS DR 70: 1023
KIM CHENG ST: 49
TAMPINES ST 71: 2428
WOODLANDS DR 42: 594
JLN TENTERAM: 226
SERANGOON NTH AVE 3: 860
JELAPANG RD: 2669
BT BATOK ST 22: 168
HOUGANG ST 91: 1896
WOODLANDS AVE 6: 2257
TAMPINES ST 72: 1033
WOODLANDS CIRCLE: 3079
CORPORATION DR: 390
LOMPANG RD: 582
WOODLANDS DR 72: 374
SENG POH RD: 14
CHOA CHU KANG ST 64: 504
BT BATOK ST 24: 882
JLN TECK WHYE: 996
WOODLANDS CRES: 1196
WOODLANDS DR 60: 1130
CHANGI VILLAGE RD: 139
BT BATOK ST 25: 1030
HOUGANG AVE 9: 1066
JURONG WEST CTRL 1: 1667
WOODLANDS RING RD: 3351
CHOA CHU KANG AVE 5: 829
TOH GUAN RD: 1957
JURONG WEST ST 61: 2458
WOODLANDS DR 14: 2622
HOUGANG ST 92: 571
CHOA CHU KANG CRES: 3064
CANBERRA RD: 1510
SEMBAWANG CL: 1340
SEMBAWANG CRES: 444
SEMBAWANG VISTA: 328
COMPASSVALE WALK: 1312
RIVERVALE ST: 854
WOODLANDS DR 62: 620
SEMBAWANG DR: 1573
WOODLANDS DR 52: 745
WOODLANDS DR 53: 403
RIVERVALE WALK: 808
COMPASSVALE LANE: 675
RIVERVALE DR: 2495
SENJA RD: 1788
JURONG WEST ST 65: 2923
RIVERVALE CRES: 1463
WOODLANDS DR 44: 467
COMPASSVALE DR: 617
WOODLANDS DR 16: 1865
COMPASSVALE RD: 1090
WOODLANDS DR 73: 639
HOUGANG ST 31: 213
JURONG WEST ST 64: 1983
GEYLANG EAST AVE 2: 5
WOODLANDS DR 71: 263
YISHUN ST 20: 219
ADMIRALTY DR: 1763
COMPASSVALE ST: 1377
BEDOK RESERVOIR VIEW: 868
YUNG SHENG RD: 284
ADMIRALTY LINK: 1062
SENGKANG EAST WAY: 964
ANG MO KIO ST 32: 369
ANG MO KIO ST 52: 299
BOON TIONG RD: 262
JURONG WEST ST 62: 199
ANCHORVALE LINK: 1253
CANBERRA LINK: 160
COMPASSVALE CRES: 473
CLEMENTI ST 12: 33
MONTREAL DR: 469
WELLINGTON CIRCLE: 710
SENGKANG EAST RD: 228
JURONG WEST AVE 3: 326
ANCHORVALE LANE: 298
REDHILL RD: 282
SENJA LINK: 168
EDGEFIELD PLAINS: 1522
ANCHORVALE DR: 699
SEGAR RD: 1113
FARRER PK RD: 276
EDGEDALE PLAINS: 1402
PUNGGOL FIELD: 1971
ANCHORVALE RD: 1085
CANTONMENT CL: 462
JLN MEMBINA: 668
FERNVALE LANE: 387
JURONG WEST ST 25: 401
CLEMENTI ST 11: 31
PUNGGOL FIELD WALK: 198
KLANG LANE: 141
PUNGGOL CTRL: 1648
JELEBU RD: 343
BUANGKOK CRES: 604
WOODLANDS DR 75: 1310
BT BATOK WEST AVE 5: 871
JELLICOE RD: 116
PUNGGOL DR: 959
JURONG WEST ST 24: 415
SEMBAWANG WAY: 10
FERNVALE RD: 1132
BUANGKOK LINK: 199
FERNVALE LINK: 191
JLN TIGA: 94
YUAN CHING RD: 94
COMPASSVALE LINK: 786
LOWER DELTA RD: 47
MARINE PARADE CTRL: 2
COMPASSVALE BOW: 190
PUNGGOL RD: 256
BEDOK CTRL: 168
PUNGGOL EAST: 110
SENGKANG CTRL: 235
TAMPINES CTRL 7: 165
SENGKANG WEST AVE: 113
PUNGGOL PL: 303
CANTONMENT RD: 255
GHIM MOH LINK: 73
SIMEI LANE: 68
YISHUN ST 41: 59
TELOK BLANGAH ST 31: 45
JLN KAYU: 26
LOR 1A TOA PAYOH: 86
PUNGGOL WALK: 21

In [43]:
from collections import OrderedDict
z_sorted_by_value = OrderedDict(sorted(z.items(), key=lambda x: x[1], reverse= True))

In [77]:
df_count_street = pd.DataFrame(z_sorted_by_value, index = ['freq']);df_count_street = df_count_street.T;df_count_street


Out[77]:
freq
YISHUN RING RD 8171
ANG MO KIO AVE 10 6656
BEDOK RESERVOIR RD 6326
ANG MO KIO AVE 3 5830
HOUGANG AVE 8 4667
BEDOK NTH ST 3 3659
ANG MO KIO AVE 4 3651
TAMPINES ST 21 3554
WOODLANDS RING RD 3351
MARSILING DR 3336
BEDOK NTH RD 3312
ANG MO KIO AVE 5 3108
PASIR RIS ST 71 3107
WOODLANDS CIRCLE 3079
CHOA CHU KANG CRES 3064
JURONG WEST ST 65 2923
CIRCUIT RD 2891
JURONG WEST ST 81 2836
JURONG WEST ST 42 2757
JURONG EAST ST 21 2717
JELAPANG RD 2669
LOR 1 TOA PAYOH 2657
WOODLANDS DR 14 2622
CHOA CHU KANG AVE 4 2603
SIMEI ST 1 2582
RIVERVALE DR 2495
BT BATOK WEST AVE 6 2483
JURONG WEST ST 61 2458
TAMPINES ST 71 2428
WOODLANDS DR 50 2383
... ...
FRENCH RD 59
YISHUN ST 41 59
CHANDER RD 57
NEW MKT RD 56
KG KAYU RD 52
KIM CHENG ST 49
CHIN SWEE RD 48
LOWER DELTA RD 47
WHAMPOA STH 46
JURONG WEST ST 51 45
EAST COAST RD 45
TELOK BLANGAH ST 31 45
TESSENSOHN RD 42
KRETA AYER RD 39
MOULMEIN RD 38
JLN PASAR BARU 36
CLEMENTI ST 12 33
CLEMENTI ST 11 31
JLN BERSEH 29
JLN KAYU 26
BT MERAH LANE 1 25
KIM PONG RD 23
PUNGGOL WALK 21
KG BAHRU HILL 18
SENG POH RD 14
MARGARET DR 13
SEMBAWANG WAY 10
SELETAR WEST FARMWAY 6 8
GEYLANG EAST AVE 2 5
MARINE PARADE CTRL 2

534 rows × 1 columns


In [28]:
df_raw


Out[28]:
year month flat_type street_name storey_range floor_area_sqm flat_model lease_commence_date resale_price Age
0 2000 1 3 ANG MO KIO AVE 1 07 TO 09 73.0 New Generation 1976 159000.0 24
1 2000 1 3 ANG MO KIO AVE 1 07 TO 09 73.0 New Generation 1976 167000.0 24
2 2000 1 3 ANG MO KIO AVE 1 07 TO 09 67.0 New Generation 1976 163000.0 24
3 2000 1 4 ANG MO KIO AVE 1 04 TO 06 97.0 New Generation 1977 260000.0 23
4 2000 1 5 ANG MO KIO AVE 1 10 TO 12 127.0 Improved 1993 408000.0 7
5 2000 1 4 ANG MO KIO AVE 1 07 TO 09 112.0 Model A 1993 315000.0 7
6 2000 1 3 ANG MO KIO AVE 1 04 TO 06 73.0 New Generation 1977 157000.0 23
7 2000 1 3 ANG MO KIO AVE 1 07 TO 09 73.0 New Generation 1977 178000.0 23
8 2000 1 5 ANG MO KIO AVE 1 13 TO 15 123.0 Standard 1977 374000.0 23
9 2000 1 5 ANG MO KIO AVE 1 01 TO 03 123.0 Standard 1977 360000.0 23
10 2000 1 4 ANG MO KIO AVE 1 04 TO 06 91.0 New Generation 1978 247000.0 22
11 2000 1 3 ANG MO KIO AVE 1 07 TO 09 68.0 New Generation 1981 160000.0 19
12 2000 1 3 ANG MO KIO AVE 1 04 TO 06 68.0 New Generation 1981 169000.0 19
13 2000 1 3 ANG MO KIO AVE 1 07 TO 09 82.0 New Generation 1981 205000.0 19
14 2000 1 3 ANG MO KIO AVE 1 01 TO 03 68.0 New Generation 1981 155000.0 19
15 2000 1 3 ANG MO KIO AVE 1 04 TO 06 82.0 New Generation 1981 205000.0 19
16 2000 1 4 ANG MO KIO AVE 1 07 TO 09 91.0 New Generation 1982 280000.0 18
17 2000 1 4 ANG MO KIO AVE 1 10 TO 12 91.0 New Generation 1982 255000.0 18
18 2000 1 5 ANG MO KIO AVE 10 04 TO 06 119.0 Improved 1979 375000.0 21
19 2000 1 3 ANG MO KIO AVE 10 10 TO 12 67.0 New Generation 1979 161000.0 21
20 2000 1 4 ANG MO KIO AVE 10 04 TO 06 99.0 New Generation 1979 255000.0 21
21 2000 1 3 ANG MO KIO AVE 10 13 TO 15 74.0 New Generation 1979 178000.0 21
22 2000 1 3 ANG MO KIO AVE 10 10 TO 12 74.0 New Generation 1979 158000.0 21
23 2000 1 3 ANG MO KIO AVE 10 13 TO 15 74.0 New Generation 1979 160000.0 21
24 2000 1 3 ANG MO KIO AVE 10 10 TO 12 74.0 New Generation 1979 160000.0 21
25 2000 1 3 ANG MO KIO AVE 10 04 TO 06 74.0 New Generation 1979 159000.0 21
26 2000 1 3 ANG MO KIO AVE 10 04 TO 06 74.0 New Generation 1979 155000.0 21
27 2000 1 3 ANG MO KIO AVE 10 04 TO 06 67.0 New Generation 1979 176000.0 21
28 2000 1 3 ANG MO KIO AVE 10 07 TO 09 67.0 New Generation 1979 166000.0 21
29 2000 1 4 ANG MO KIO AVE 10 01 TO 03 92.0 New Generation 1979 252000.0 21
... ... ... ... ... ... ... ... ... ... ...
477803 2017 11 4 YISHUN ST 61 10 TO 12 84.0 Simplified 1986 308000.0 31
477804 2017 11 4 YISHUN ST 61 07 TO 09 102.0 Model A 1987 335000.0 30
477805 2017 11 4 YISHUN ST 61 04 TO 06 84.0 Simplified 1986 345000.0 31
477806 2017 11 4 YISHUN ST 61 01 TO 03 107.0 Model A 1988 340000.0 29
477807 2017 11 4 YISHUN ST 61 04 TO 06 85.0 Simplified 1987 302000.0 30
477808 2017 11 4 YISHUN ST 71 04 TO 06 84.0 Simplified 1985 310000.0 32
477809 2017 11 4 YISHUN ST 71 04 TO 06 104.0 Model A 1985 330000.0 32
477810 2017 11 4 YISHUN ST 72 01 TO 03 90.0 Simplified 1985 318000.0 32
477811 2017 11 4 YISHUN ST 72 07 TO 09 84.0 Simplified 1987 328000.0 30
477812 2017 11 4 YISHUN ST 72 04 TO 06 84.0 Simplified 1985 310000.0 32
477813 2017 11 4 YISHUN ST 81 01 TO 03 104.0 Model A 1987 340000.0 30
477814 2017 11 4 YISHUN ST 81 07 TO 09 103.0 Model A 1988 420000.0 29
477815 2017 11 5 YISHUN AVE 11 07 TO 09 121.0 Improved 1988 438000.0 29
477816 2017 11 5 YISHUN AVE 2 01 TO 03 121.0 Improved 1987 425000.0 30
477817 2017 11 5 YISHUN AVE 3 07 TO 09 122.0 Improved 1986 460000.0 31
477818 2017 11 5 YISHUN AVE 4 07 TO 09 121.0 Improved 1987 408000.0 30
477819 2017 11 5 YISHUN RING RD 04 TO 06 127.0 Improved 1988 408000.0 29
477820 2017 11 5 YISHUN RING RD 07 TO 09 122.0 Improved 1988 460000.0 29
477821 2017 11 5 YISHUN RING RD 07 TO 09 122.0 Improved 1987 550000.0 30
477822 2017 11 5 YISHUN ST 11 01 TO 03 121.0 Improved 1985 385000.0 32
477823 2017 11 5 YISHUN ST 21 10 TO 12 121.0 Improved 1985 480000.0 32
477824 2017 11 5 YISHUN ST 81 04 TO 06 122.0 Improved 1988 490000.0 29
477825 2017 11 5 YISHUN ST 81 07 TO 09 122.0 Improved 1988 548000.0 29
477826 2017 11 EXECUTIVE YISHUN AVE 6 01 TO 03 142.0 Apartment 1988 465000.0 29
477827 2017 11 EXECUTIVE YISHUN ST 61 01 TO 03 142.0 Apartment 1987 515000.0 30
477828 2017 11 EXECUTIVE YISHUN ST 61 10 TO 12 164.0 Apartment 1992 728000.0 25
477829 2017 11 EXECUTIVE YISHUN ST 81 01 TO 03 154.0 Maisonette 1987 648888.0 30
477830 2017 11 EXECUTIVE YISHUN ST 81 07 TO 09 142.0 Apartment 1988 658000.0 29
477831 2017 11 EXECUTIVE YISHUN ST 81 04 TO 06 142.0 Apartment 1987 598000.0 30
477832 2017 11 MULTI-GENERATION YISHUN AVE 4 01 TO 03 179.0 Multi Generation 1987 695000.0 30

477833 rows × 10 columns

Bi Variate plots on train.csv


In [18]:
new_df = pd.read_csv(f'{PATH}train.csv')

In [19]:
new_df.head(1)


Out[19]:
town flat_type block street_name storey_range floor_area_sqm flat_model lease_commence_date resale_price building_id month year age no_times_resold
0 ANG MO KIO 3 ROOM 216 ANG MO KIO AVE 1 07 TO 09 73.0 New Generation 1976 159000.0 ANG MO KIO_216_ANG MO KIO AVE 1_07 TO 09_New G... 1 2000 24 33

In [20]:
new_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416761 entries, 0 to 416760
Data columns (total 14 columns):
town                   416761 non-null object
flat_type              416761 non-null object
block                  416761 non-null object
street_name            416761 non-null object
storey_range           416761 non-null object
floor_area_sqm         416761 non-null float64
flat_model             416761 non-null object
lease_commence_date    416761 non-null int64
resale_price           416761 non-null float64
building_id            416761 non-null object
month                  416761 non-null int64
year                   416761 non-null int64
age                    416761 non-null int64
no_times_resold        416761 non-null int64
dtypes: float64(2), int64(5), object(7)
memory usage: 44.5+ MB

In [20]:
new_df.head(1)


Out[20]:
town flat_type block street_name storey_range floor_area_sqm flat_model lease_commence_date resale_price building_id month year age no_times_resold
0 ANG MO KIO 3 ROOM 216 ANG MO KIO AVE 1 07 TO 09 73.0 New Generation 1976 159000.0 ANG MO KIO_216_ANG MO KIO AVE 1_07 TO 09_New G... 1 2000 24 33

In [21]:
plt.figure(figsize=(8,6))
plt.scatter(range(new_df.shape[0]), np.sort(new_df.resale_price.values))
plt.xlabel('index', fontsize=12)
plt.ylabel('price', fontsize=12)
##if needed, one can truncate the high values.


Out[21]:
Text(0,0.5,'price')

In [22]:
plt.figure(figsize=(12,8))
sns.distplot(new_df.resale_price.values, bins=50, kde=True)
plt.xlabel('price', fontsize=12)
plt.show()
#Certainly a long right tail.let us plot the log of resale_price variable.



In [23]:
plt.figure(figsize=(12,8))
sns.distplot(np.log(new_df.resale_price.values), bins=50, kde=True)
plt.xlabel('price', fontsize=12)


Out[23]:
Text(0.5,0,'price')

Now let us see how the median housing price change with time.


In [24]:
grouped_df = new_df.groupby('year')['resale_price'].aggregate(np.median).reset_index()
plt.figure(figsize=(8,8))
sns.barplot(grouped_df.year.values, grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Year', fontsize=12)
plt.xticks(rotation=45)


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[24]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17]),
 <a list of 18 Text xticklabel objects>)

In [25]:
grouped_df = new_df.groupby('month')['resale_price'].aggregate(np.median).reset_index()
plt.figure(figsize=(12,8))
sns.barplot(grouped_df.month.values, grouped_df.resale_price.values, alpha=0.8, color=color[5])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Month', fontsize=12)
plt.xticks(rotation='vertical')


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[25]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]),
 <a list of 12 Text xticklabel objects>)

In [26]:
grouped_df = new_df.groupby('no_times_resold')['resale_price'].aggregate(np.median).reset_index()
plt.figure(figsize=(15,15))
sns.barplot(grouped_df.no_times_resold.values, grouped_df.resale_price.values, alpha=0.8, color=color[4])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('no_times_resold', fontsize=12)
plt.xticks(rotation='vertical')


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[26]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76]),
 <a list of 77 Text xticklabel objects>)

In [27]:
plt.figure(figsize=(25,25))
plt.xlabel('floor_area_sqm', fontsize=16)
plt.ylabel('resale_price', fontsize=16)
new_df.groupby('floor_area_sqm')['resale_price'].mean().plot(kind='area')


Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fdfe1162b0>

In [28]:
plt.figure(figsize=(10,10))
plt.xticks(rotation=45)
new_df.groupby('lease_commence_date')['resale_price'].mean().plot(kind='bar')


Out[28]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fdf954a0f0>

In [29]:
plt.figure(figsize=(12,12))
plt.xticks(rotation = 'vertical')
sns.boxplot(x="no_times_resold", y="age", data=new_df)


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:462: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  box_data = remove_na(group_data)
Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fdfafbf1d0>

In [30]:
plt.figure(figsize=(12,12))
plt.xticks(rotation = 'vertical')
sns.boxplot(x="no_times_resold", y="resale_price", data=new_df)


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:462: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  box_data = remove_na(group_data)
Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fdf353b710>

In [39]:
figbi, axesbi = plt.subplots(2, 4, figsize=(25, 25))
new_df.groupby('floor_area_sqm')['resale_price'].mean().plot(kind='barh',ax=axesbi[0,0])
new_df.groupby('lease_commence_date')['resale_price'].mean().plot(kind='barh',ax=axesbi[0,1])
new_df.groupby('no_times_resold')['resale_price'].mean().plot(kind='barh',ax=axesbi[0,2])
new_df.groupby('month')['resale_price'].mean().plot(kind='barh',ax=axesbi[0,3])
new_df.groupby('flat_type')['resale_price'].mean().plot(kind='barh',ax=axesbi[1,0])
new_df.groupby('flat_model')['resale_price'].mean().plot(kind='barh',ax=axesbi[1,1])
sns.boxplot(x="no_times_resold", y="age", data=new_df,ax=axesbi[1,2])
sns.boxplot(x="age", y="resale_price", data=new_df,ax=axesbi[1,3])


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:462: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  box_data = remove_na(group_data)
Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fdf82725f8>

In [43]:
sns.jointplot(x="age", y="no_times_resold", data=new_df);



In [45]:
f, ax = plt.subplots(figsize=(10, 8))
corr = new_df.corr()
sns.heatmap(corr,
            mask=np.zeros_like(corr, dtype=np.bool), 
            cmap='hot',
            square=True, ax=ax)
#resaleprice and no_times_resold


Out[45]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fdffe95dd8>

In [47]:
ulimit = np.percentile(new_df.resale_price.values, 99.5)
llimit = np.percentile(new_df.resale_price.values, 0.5)
new_df['resale_price'].ix[new_df['resale_price']>ulimit] = ulimit
new_df['resale_price'].ix[new_df['resale_price']<llimit] = llimit

col = "floor_area_sqm"
ulimit = np.percentile(new_df[col].values, 99.5)
llimit = np.percentile(new_df[col].values, 0.5)
new_df[col].ix[new_df[col]>ulimit] = ulimit
new_df[col].ix[new_df[col]<llimit] = llimit

plt.figure(figsize=(12,12))
sns.jointplot(x=np.log1p(new_df.floor_area_sqm.values), y=np.log1p(new_df.resale_price.values), size=10)
plt.ylabel('Log of Price', fontsize=12)
plt.xlabel('Log of Total area in square metre', fontsize=12)


C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  This is separate from the ipykernel package so we can avoid doing imports until
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:194: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:9: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  if __name__ == '__main__':
Out[47]:
Text(0.5,86,'Log of Total area in square metre')
<matplotlib.figure.Figure at 0x1fdf700e390>

In [48]:
plt.figure(figsize=(12,8))
sns.countplot(x="storey_range", data=new_df)
plt.ylabel('Count', fontsize=12)
plt.xlabel('storey_range', fontsize=12)
plt.xticks(rotation='vertical')


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)
Out[48]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]),
 <a list of 24 Text xticklabel objects>)

In [22]:
grouped_df = new_df.groupby('storey_range')['resale_price'].aggregate(np.median).reset_index()
plt.figure(figsize=(12,8))
sns.pointplot(grouped_df.storey_range.values, grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Floor', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)

In [24]:
grouped_df = new_df.groupby('town')['resale_price'].aggregate(np.median).reset_index()
plt.figure(figsize=(12,8))
sns.pointplot(grouped_df.town.values, grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Town', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)

In [27]:
new_df.columns


Out[27]:
Index(['town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'lease_commence_date', 'resale_price',
       'building_id', 'month', 'year', 'age', 'no_times_resold'],
      dtype='object')

In [37]:
grouped_df = new_df.groupby('no_times_resold')['resale_price'].aggregate(np.median).reset_index()
plt.figure(figsize=(15,15))
sns.pointplot(grouped_df.no_times_resold.values, grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('no_times_resold', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)

In [32]:
grouped_df = new_df.groupby('lease_commence_date')['resale_price'].aggregate(np.median).reset_index()
plt.figure(figsize=(15,15))
sns.pointplot(grouped_df.lease_commence_date.values, grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('lease_commence_date', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)

In [31]:
grouped_df = new_df.groupby('floor_area_sqm')['resale_price'].aggregate(np.median).reset_index()
plt.figure(figsize=(15,15))
sns.pointplot(grouped_df.floor_area_sqm.values, grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Floor Area', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)

In [28]:
grouped_df = new_df.groupby('flat_type')['resale_price'].aggregate(np.median).reset_index()
plt.figure(figsize=(12,8))
sns.pointplot(grouped_df.flat_type.values, grouped_df.resale_price.values, alpha=0.8, color=color[2])
plt.ylabel('Median Price', fontsize=12)
plt.xlabel('Town', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()


C:\ProgramData\Anaconda3\lib\site-packages\seaborn\categorical.py:1460: FutureWarning: remove_na is deprecated and is a private function. Do not use.
  stat_data = remove_na(group_data)

In [ ]: