In [1]:
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ggplot import *
import brewer2mpl
import dateutil
from random import shuffle, randint
import json
import matplotlib as mpl

theme_bw()
%matplotlib inline

In [2]:
from sklearn.cross_validation import train_test_split

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import make_scorer    
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.grid_search import GridSearchCV

In [3]:
data = pd.read_csv('../data/run/warszawa/data.csv')
data = data.fillna(-1)

1 Understand data


In [4]:
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 31172 entries, 0 to 31171
Data columns (total 39 columns):
market            31172 non-null float64
area              31172 non-null float64
price             31172 non-null int64
floor             31172 non-null object
num_room          31172 non-null float64
num_bedroom       31172 non-null float64
num_bathroom      31172 non-null float64
kitchen           31172 non-null float64
basement          31172 non-null float64
garage            31172 non-null float64
voivodeship       31172 non-null object
powiat            31172 non-null object
gmina             31172 non-null object
place             31172 non-null object
loudness          31172 non-null float64
district          31172 non-null object
street            31172 non-null object
furnished         31172 non-null float64
state_building    31172 non-null float64
type_building     31172 non-null float64
year              31172 non-null float64
source_ad         31172 non-null float64
balcony           31172 non-null float64
elevator          31172 non-null float64
terrace           31172 non-null float64
num_offer         31172 non-null object
making            31172 non-null float64
num_levels        31172 non-null float64
type_ownership    31172 non-null float64
land_register     31172 non-null float64
availability      31172 non-null object
nature            31172 non-null object
busy_street       31172 non-null float64
technique         31172 non-null float64
type_roof         31172 non-null object
surroundings      31172 non-null float64
type_offer        31172 non-null object
offer_date        31172 non-null object
uri               31172 non-null object
dtypes: float64(24), int64(1), object(14)
memory usage: 9.5+ MB

In [5]:
data.describe()


Out[5]:
market area price num_room num_bedroom num_bathroom kitchen basement garage loudness ... balcony elevator terrace making num_levels type_ownership land_register busy_street technique surroundings
count 31172.000000 31172.000000 31172.000000 31172.000000 31172.000000 31172.000000 31172.000000 31172.000000 31172.000000 31172.000000 ... 31172.000000 31172.000000 31172.000000 31172.000000 31172.000000 31172.000000 31172.000000 31172.000000 31172.000000 31172
mean 0.429296 66.852206 575647.763442 2.671436 -0.761773 -0.399076 -0.062267 -0.555819 -0.998717 0.058193 ... -0.118311 -0.460413 -0.913769 1.355640 -0.083569 0.361093 -0.942480 -0.997337 -0.612633 -1
std 1.481844 39.872816 426376.082427 1.030958 0.828709 1.009968 1.229133 0.831316 0.035799 1.235826 ... 0.992992 0.887719 0.406241 2.289604 1.038190 1.642307 0.333835 0.061208 0.909627 0
min -1.000000 1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1
25% -1.000000 47.000000 349000.000000 2.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1
50% -1.000000 59.900000 455000.000000 3.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 ... -1.000000 -1.000000 -1.000000 1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 -1
75% 2.000000 79.660000 654000.000000 3.000000 -1.000000 1.000000 1.000000 -1.000000 -1.000000 1.000000 ... 1.000000 1.000000 -1.000000 1.000000 1.000000 2.000000 -1.000000 -1.000000 -1.000000 -1
max 2.000000 4000.000000 9932000.000000 22.000000 20.000000 3.000000 2.000000 1.000000 0.000000 4.000000 ... 1.000000 1.000000 1.000000 12.000000 2.000000 5.000000 1.000000 1.000000 5.000000 -1

8 rows × 25 columns

1.1 Base features


In [6]:
hist_columns = ['area', 'price', 'num_room', 'num_bedroom', 'num_bathroom', 'kitchen']
data[ hist_columns ].hist(figsize=(16,12),bins=30)
plt.show()


1.2 Area


In [7]:
def summary(values, percentiles=[1, 5, 95, 99]):
    for percnetile in percentiles:
        print '{0}th -> {1}'.format(percnetile, np.percentile(values, percnetile) )

In [8]:
summary(data.area.values)


1th -> 24.0071
5th -> 31.0
95th -> 125.25
99th -> 172.2

In [9]:
data['area_log'] = data.area.map(lambda x: np.log2(x))
summary(data.area_log.values)


1th -> 4.58538920911
5th -> 4.95419631039
95th -> 6.9686667932
99th -> 7.42794133251

In [11]:
plt.figure(figsize=(500,20))
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)) = plt.subplots(4,2, figsize=(15, 10))
fig.subplots_adjust(hspace=.8)

def sub_plot(data, column, ax1, ax2):
    data.hist(column, bins=20, ax=ax1)
    data.boxplot(column, ax=ax2)

sub_plot(data, 'area', ax1, ax2)
sub_plot(data[ data.area < 172 ], 'area', ax3, ax4)
sub_plot(data, 'area_log', ax5, ax6)
sub_plot(data[ (data.area_log > 4) & (data.area_log < 8)], 'area_log', ax7, ax8)


<matplotlib.figure.Figure at 0x1125c6090>

area_log without outliers looks like normal distribution, so removed those outliers.


In [12]:
data = data[ (data.area_log > 4) & (data.area_log < 8)]

1.3 Price


In [13]:
summary(data.price.values, range(95, 100, 1))


95th -> 1250000.0
96th -> 1350000.0
97th -> 1499000.0
98th -> 1700000.0
99th -> 2200000.0

In [14]:
data['price_log'] = data.price.map(lambda x: np.log2(x))
summary(data['price_log'].values)


1th -> 17.5522187854
5th -> 17.9005948197
95th -> 20.2534966642
99th -> 21.0690720931

In [15]:
plt.figure(figsize=(500,20))
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)) = plt.subplots(4,2, figsize=(15, 10))
fig.subplots_adjust(hspace=.8)

sub_plot(data, 'price', ax1, ax2)
sub_plot(data[ data.price < 1700000 ], 'price', ax3, ax4)
sub_plot(data, 'price_log', ax5, ax6)
sub_plot(data[ (data.price_log > 17) & (data.price_log < 21)], 'price_log', ax7, ax8)


<matplotlib.figure.Figure at 0x1150bad10>

Removed outliers based by price


In [16]:
data = data[ (data.price_log > 17) & (data.price_log < 21)]

2. Prepare Data

  1. Correlation matrix
  2. Importance features
  3. Feature engineering
  4. Feature selection

2.1 Correlation matrix

Let's to see correlation matrix


In [17]:
f, ax = plt.subplots(figsize=(20,20))
sns.corrplot(data, sig_stars=False, ax=ax)


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x1167d7f10>

In [18]:
def select_features(data):
    columns = data.loc[ :, (data.dtypes == np.int64) | (data.dtypes == np.float64) | (data.dtypes == np.bool) ].columns.values    
    return [feat for feat in columns if 'price' not in feat]

def get_X_y(data, cols=None):
    if not cols:
        cols = select_features(data)
        
    X = data[cols].values
    y = data['price'].values
    
    return X,y

def get_importance__features(data, model=RandomForestRegressor(), limit=25):
    X,y = get_X_y(data)
    cols = select_features(data)
    
    model.fit(X, y)
    feats = pd.DataFrame(model.feature_importances_, index=data[cols].columns)
    feats = feats.sort([0], ascending=False) [:limit]
    return feats.rename(columns={0:'name'})
    
def draw_importance_features(data, model=RandomForestRegressor(), limit=30):
    feats = get_importance__features(data, model, limit)
    feats.plot(kind='bar', figsize=(20, 8))


draw_importance_features(data)


Let's try add new feature - price by meters and log of this value


In [19]:
data.loc[ : , 'price_m2'] = data.apply(lambda x: x['price'] / float(x['area']), axis=1)
summary(data['price_m2'].values)


1th -> 4760.0
5th -> 5593.22033898
95th -> 12766.1058882
99th -> 16897.8217352

In [20]:
data.loc[ : , 'price_m2_log'] = data['price'].map(lambda x: np.log2(x))
summary(data['price_m2_log'].values)


1th -> 17.5656971268
5th -> 17.8985156214
95th -> 20.1825301429
99th -> 20.671416672

In [21]:
plt.figure(figsize=(500,20))
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3,2, figsize=(15, 10))
fig.subplots_adjust(hspace=.8)

sub_plot(data, 'price_m2', ax1, ax2)
sub_plot(data[ data.price_m2 < 16897 ], 'price_m2', ax3, ax4)
sub_plot(data, 'price_m2_log', ax5, ax6)


<matplotlib.figure.Figure at 0x118dda110>

Let's removed outliers based on price by m^2


In [22]:
data = data[ data.price_m2 < 16897 ]

In [23]:
print draw_importance_features(data)


None

In [24]:
data['floor_num'] = data['floor'].map(lambda x: float(x.split('/')[0]) if isinstance(x, basestring) else -1 )
data['floor_build'] = data['floor'].map(lambda x: float(x.split('/')[1]) if isinstance(x, basestring) else -1)
data['floor_ratio'] = (data['floor_build'] + 1) / (data['floor_num']  + 1)
data['floor_ratio'].fillna(-1, inplace=True)

data['last_floor'] = data['floor_num'] == data['floor_build']
data['zero_floor'] = data['floor_num'] == 0
data['one_floor'] = data['floor_num'] == 1
data['two_floor'] = data['floor_num'] == 2
data['three_floor'] = data['floor_num'] == 3
data['four_floor'] = data['floor_num'] == 4
data['five_floor'] = data['floor_num'] == 5

draw_importance_features(data)



In [25]:
floor_cols = [c for c in data.columns.values if 'floor' in c]
data[ floor_cols ].hist(figsize=(16,12),bins=20)
plt.show()



In [26]:
def year_decade(year):
    if year < 1901:
        return 1900
    elif year < 1931:
        return 1930
    elif year < 1941:
        return 1940
    elif year < 1951:
        return 1950
    elif year < 1961:
        return 1960
    elif year < 1971:
        return 1970
    elif year < 1981:
        return 1980
    elif year < 1991:
        return 1990
    elif year < 2001:
        return 2000
    elif year < 2006:
        return 2005
    elif year < 2011:
        return 2010
    elif year < 2012:
        return 2012
    elif year < 2015:
        return 2014
    else:
        return year
    

data['year_decade'] = data['year'].map(year_decade)

data['year_decade'].hist(bins=20)
draw_importance_features(data)



In [27]:
data['how_old'] = data.year.map(lambda x: -1 if x == -1 else 2015 - x)
summary(data['how_old'].values)


1th -> -1.0
5th -> -1.0
95th -> 80.0
99th -> 103.0

In [28]:
data['how_old_log'] = data['how_old'].map(lambda x: -1 if x < 1 else np.log2(x))

summary(data['how_old_log'].values)


1th -> -1.0
5th -> -1.0
95th -> 6.32192809489
99th -> 6.68650052718

In [29]:
plt.figure(figsize=(500,20))
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3,2, figsize=(15, 10))
fig.subplots_adjust(hspace=.8)

sub_plot(data, 'how_old', ax1, ax2)
sub_plot(data[ data.how_old < 103 ], 'how_old', ax3, ax4)
sub_plot(data[ False == data.how_old.isnull() ], 'how_old_log', ax5, ax6)


<matplotlib.figure.Figure at 0x11cfd1250>

In [30]:
draw_importance_features(data)


Normalize districts


In [31]:
import json

districts = None
with open('../data/info/warszawa/district.json', 'r') as f:
    districts = json.loads(f.read())


subdistrict_to_district = {}
for key in districts.keys():
    for value in districts[key]:
        subdistrict_to_district[value] = key

def norm_district(district):
    if False == isinstance (district, basestring):
        return district

    district = district.decode("utf8")
    if  district in districts:
        return district
    elif district in subdistrict_to_district:
        return subdistrict_to_district[district]
    else:
        return 'unknown'
    
data['district_norm'] = data['district'].map(norm_district)
district = data.groupby('district_norm').count()['market'].reset_index()
district.columns = ['district', 'count']
district.plot(kind='bar')

district


Out[31]:
district count
0 -1 378
1 bemowo 2158
2 białołęka 1805
3 bielany 1737
4 mokotów 4012
5 ochota 1670
6 praga-południe 3423
7 praga-północ 752
8 rembertów 211
9 targówek 1294
10 unknown 2
11 ursus 999
12 ursynów 2306
13 wawer 811
14 wesoła 411
15 wilanów 853
16 wola 2478
17 włochy 626
18 śródmieście 3574
19 żoliborz 931

In [32]:
data['district_map'], district_indexes =  pd.factorize(data.district_norm)
print list(enumerate(district_indexes))

print draw_importance_features(data)


[(0, u'ochota'), (1, -1), (2, u'targ\xf3wek'), (3, u'ursyn\xf3w'), (4, u'weso\u0142a'), (5, u'\u015br\xf3dmie\u015bcie'), (6, u'bielany'), (7, u'wola'), (8, u'ursus'), (9, u'w\u0142ochy'), (10, u'wilan\xf3w'), (11, u'praga-p\xf3\u0142noc'), (12, u'mokot\xf3w'), (13, u'bia\u0142o\u0142\u0119ka'), (14, u'praga-po\u0142udnie'), (15, u'bemowo'), (16, u'\u017coliborz'), (17, u'wawer'), (18, u'rembert\xf3w'), (19, 'unknown')]
None

In [33]:
data['street_map'], district_indexes =  pd.factorize(data.street)

draw_importance_features(data)


Let's try better understand next features

  • availability
  • nature
  • type_roof
  • offer_date

In [34]:
data['offer_date'] = pd.to_datetime(data['offer_date'])
data['offer_date_year'] = data['offer_date'].map(lambda x: int(x.year) )
data['offer_date_month'] = data['offer_date'].map(lambda x: int(x.month) )
data['offer_date_day'] = data['offer_date'].map(lambda x: int(x.day) )
data['offer_date_dayofweek'] = data['offer_date'].map(lambda x: int(x.dayofweek) )
data['offer_date_weekofyear'] = data['offer_date'].map(lambda x: int(x.weekofyear) )

In [35]:
draw_importance_features(data)



In [36]:
def map_nature(value):
    if value == -1: return value
    return sum([ 2**int(p) for p in value.split(',')])

data['nature_map'] =  data['nature'].map(map_nature)
draw_importance_features(data)



In [37]:
data['type_roof_map'], indexes_roof = pd.factorize( data['type_roof'] )
draw_importance_features(data, limit=40)



In [38]:
def map_availability(value):
    if value == -1: return value
    if value == 'natychmiast': return -2
    if value == 'do uzgodnienia': return -3
    
    values = value.split(' ')
    if len(values) == 2:
        month = values[0]
        year = int(values[1])
        pl_months = {
            'styczeń': 1,
            'luty': 2,
            'marzec': 3,
            'kwiecień': 4,
            'maj': 5,
            'czerwiec': 6,
            'lipiec': 7,
            'sierpień': 8,
            'wrzesień': 9,
            'październik': 10,
            'listopad': 11,
            'grudzień': 12,
        }
            
        return datetime.datetime(year, pl_months[month], 1)
        
    return -4
    
data['availability_map'] = data['availability'].map(map_availability)

In [39]:
def availability_diff(x):
    if isinstance(x['availability_map'], int): return x['availability_map']

    try:
        return ( x['offer_date']  - x['availability_map'] ).days
    except:
        return 0

data['availability_diff'] = data.apply(availability_diff, axis=1)

draw_importance_features(data, limit=40)


Generate new features

Generate permutation for all numerical features and multiply them


In [40]:
def get_num_features(data):
    columns = data.loc[ :, (data.dtypes == np.float) |  (data.dtypes == np.int64) ].columns.values
    
    return [c for c in columns if 'price' not in c]

In [41]:
import itertools
num_cols = get_num_features(data)
for feat_x, feat_y in itertools.product(num_cols, num_cols):
    name = '{0}_{1}'.format( feat_x,feat_y )
    data[name] =  data[feat_x] * data[feat_y]

In [42]:
draw_importance_features(data, limit=100)



In [ ]:
print get_importance__features(data)

Train/test split


In [43]:
def get_train_test(data, target_column, cols = None):
    data['is_test'] = data['price'].map( lambda x: randint(1, 3) == 3 )
    if cols is None:
        cols = ['area', 'area_log', 'num_room', 'year', 'making']

    train = data[ False == data['is_test'] ]
    test  = data[ True  == data['is_test'] ]

    X_train = train[cols].values
    y_train = train[target_column].values
    X_test  = test[cols].values
    y_test  = test[target_column].values
    
    return X_train, X_test, y_train, y_test


cols = select_features(data)
X_train, X_test, y_train, y_test = get_train_test(data, 'price', cols)
train = data[ False == data.is_test ] 
test  = data[ True == data.is_test ]

Quality function - MAPE


In [44]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

Single variable model

It's the simple possible (and relatively sensible) model which we can treat like base line.


In [45]:
def quality_func(price, pred_price):
    return {
        'r2': r2_score(price, pred_price),
        'mae': mean_absolute_error(price, pred_price),
        'mape': mean_absolute_percentage_error(price, pred_price)
    }

In [47]:
mean_price = train['price_m2'].mean()
test.loc[ : , 'pred_price_mean'] = test['area'].map(lambda area: area * mean_price)

quality_func(test['price'].values, test['pred_price_mean'].values)


Out[47]:
{'mae': 109649.79016612926,
 'mape': 19.710614334014775,
 'r2': 0.67634831215842905}

Two variables model

Based on price and district.


In [48]:
district_prices = data[ ['district_map', 'price_m2'] ].groupby('district_map').agg(np.mean).to_dict()['price_m2']
district_prices
test.loc[ : , 'pred_price_mean_district'] = test.apply(lambda x: district_prices[ x['district_map'] ] * x['area'], axis=1)

quality_func(test['price'].values, test['pred_price_mean_district'].values)


Out[48]:
{'mae': 86369.033182100568,
 'mape': 15.19984057742508,
 'r2': 0.7875712966881987}

More advanced models


In [49]:
def models():
    yield ('extra_tree', ExtraTreesRegressor())
    yield ('random_forest', RandomForestRegressor())
    yield ('bagging', BaggingRegressor())
    yield ('gradient_boos', GradientBoostingRegressor())
    yield ('dec_tree', DecisionTreeRegressor())

In [50]:
for name, model in models():
    model.fit(X_train, y_train)
    
    key_pred_model = 'pred_price_{0}'.format(name)
    test.loc[ : , key_pred_model ] = model.predict(X_test)

    print model
    print quality_func(test['price'].values, test[key_pred_model].values)


ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
          verbose=0, warm_start=False)
{'mae': 65587.142847364812, 'r2': 0.85297108967196378, 'mape': 11.316634641658307}
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)
{'mae': 72850.970142526552, 'r2': 0.82898077306240547, 'mape': 12.633967400063273}
BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
         verbose=0)
{'mae': 73246.82125950449, 'r2': 0.82941029937773936, 'mape': 12.695201677518666}
GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=3, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             random_state=None, subsample=1.0, verbose=0, warm_start=False)
{'mae': 77036.997750222523, 'r2': 0.82664219117963855, 'mape': 13.608671827238753}
DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, random_state=None,
           splitter='best')
{'mae': 91605.255304585895, 'r2': 0.69141793256823858, 'mape': 15.712987647344354}

Predict price_log


In [77]:
#cols = select_features(data)
#print len(cols)

def run_log_model(train, test, model=ExtraTreesRegressor(), cols=None, output_column='pred_price_log'):
    if cols is None:
        cols = select_features(data)
        
    X_train    = train[cols].values
    y_train    = train['price_log'].values
    X_test     = test[cols].values
    y_test_log = test['price_log'].values
    y_test     = test['price'].values

    model.fit(X_train, y_train)
    y_pred_log = model.predict(X_test)
    test.loc[ : , output_column ] = np.exp2(y_pred_log)

    return quality_func(y_test, test[output_column].values)

cols = select_features(data)
#print run_log_model(train, test, ExtraTreesRegressor(), cols)

In [78]:
def models():
    yield ('extra_tree', ExtraTreesRegressor())
    yield ('random_forest', RandomForestRegressor())
    yield ('bagging', BaggingRegressor())
    yield ('gradient_boos', GradientBoostingRegressor())
    yield ('dec_tree', DecisionTreeRegressor())

In [79]:
cols = select_features(data)
for model_name, model in models():
    output_column = 'pred_price_log_{0}'.format(model_name)
    print model_name
    print run_log_model(train, test, model, cols, output_column)


extra_tree
{'mae': 63865.200188039322, 'r2': 0.85624665606435324, 'mape': 10.825057263135582}
random_forest
{'mae': 72834.082662102068, 'r2': 0.82077604089792711, 'mape': 12.244500751534211}
bagging
{'mae': 71595.330402307474, 'r2': 0.82893644429368973, 'mape': 12.097151423914633}
gradient_boos
{'mae': 74747.759406436657, 'r2': 0.82476713709650817, 'mape': 12.579985540278635}
dec_tree
{'mae': 92426.863249638496, 'r2': 0.68162316816696467, 'mape': 15.726886354553384}

In [85]:
for n_estimators in range(10, 101, 20):
    for min_samples_split in [2, 4, 6]:
        output_column = 'pred_price_log_extra_tree_tun_{0}_{1}'.format(n_estimators, min_samples_split)
        model = ExtraTreesRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split, n_jobs=-1)

        print output_column
        print run_log_model(train, test, model, cols, output_column)


pred_price_log_extra_tree_tun_10_2
{'mae': 65197.2281798892, 'r2': 0.84604391844707505, 'mape': 10.971295874094416}
pred_price_log_extra_tree_tun_10_4
{'mae': 63554.077931728032, 'r2': 0.85461801043595365, 'mape': 10.741056371457262}
pred_price_log_extra_tree_tun_10_6
{'mae': 63677.875685163926, 'r2': 0.85401724860791517, 'mape': 10.743046272899033}
pred_price_log_extra_tree_tun_30_2
{'mae': 61695.966176786744, 'r2': 0.86087183648113497, 'mape': 10.349230329332286}
pred_price_log_extra_tree_tun_30_4
{'mae': 61673.84165025191, 'r2': 0.86287818746369715, 'mape': 10.378112104206757}
pred_price_log_extra_tree_tun_30_6
{'mae': 61987.596759509295, 'r2': 0.860865202076882, 'mape': 10.416739151717898}
pred_price_log_extra_tree_tun_50_2
{'mae': 61137.775141291444, 'r2': 0.86444447164645632, 'mape': 10.278591559709074}
pred_price_log_extra_tree_tun_50_4
{'mae': 60906.850852916381, 'r2': 0.86599498965627719, 'mape': 10.244707506668055}
pred_price_log_extra_tree_tun_50_6
{'mae': 61507.687801283922, 'r2': 0.86227175192064298, 'mape': 10.30764399397256}
pred_price_log_extra_tree_tun_70_2
{'mae': 60742.923112858865, 'r2': 0.8652065801083656, 'mape': 10.181662961276718}
pred_price_log_extra_tree_tun_70_4
{'mae': 60840.177846236089, 'r2': 0.86549466941727771, 'mape': 10.208707947245047}
pred_price_log_extra_tree_tun_70_6
{'mae': 60609.639766076078, 'r2': 0.86651842911176347, 'mape': 10.196098723572893}
pred_price_log_extra_tree_tun_90_2
{'mae': 60302.222902434507, 'r2': 0.86673686381439041, 'mape': 10.138784797573084}
pred_price_log_extra_tree_tun_90_4
{'mae': 60901.819122780835, 'r2': 0.86509555445018327, 'mape': 10.203844453033508}
pred_price_log_extra_tree_tun_90_6
{'mae': 60925.83039343408, 'r2': 0.86480567437737077, 'mape': 10.208664470868587}

Ensemble models


In [86]:
pred_columns = [ c for c in test.columns.values if 'pred' in c ]

def run_ensemble(test, columns):
    test.loc[ : , 'pred_price_ensemble'] = test[pred_columns].apply(np.mean, axis=1)
    return quality_func(test['price'].values, test['pred_price_ensemble'].values)
    
run_ensemble(test, pred_columns)


Out[86]:
{'mae': 63009.391086174837,
 'mape': 10.661382149892017,
 'r2': 0.86295739672483507}

In [88]:
test[ ['price', 'pred_price_ensemble'] + pred_columns ].head()


Out[88]:
price pred_price_ensemble pred_price_mean pred_price_mean_district pred_price_extra_tree pred_price_random_forest pred_price_bagging pred_price_gradient_boos pred_price_dec_tree pred_price_log ... pred_price_log_extra_tree_tun_30_6 pred_price_log_extra_tree_tun_50_2 pred_price_log_extra_tree_tun_50_4 pred_price_log_extra_tree_tun_50_6 pred_price_log_extra_tree_tun_70_2 pred_price_log_extra_tree_tun_70_4 pred_price_log_extra_tree_tun_70_6 pred_price_log_extra_tree_tun_90_2 pred_price_log_extra_tree_tun_90_4 pred_price_log_extra_tree_tun_90_6
0 240000 245607.253779 230006.594237 240894.656695 253000.0 253400.0 237430.0 266376.113197 244000 238133.365937 ... 242313.004908 246019.354582 251731.764409 241498.896956 247033.178913 244614.892276 252377.543185 254618.184070 243255.796254 248327.448428
3 345000 312309.632530 279293.721574 281341.430709 300950.0 319150.0 314300.0 315508.999374 309000 314423.062331 ... 315363.174313 320761.643182 323528.212172 320954.364649 317309.091110 313508.671468 326858.786021 315838.133987 311070.006759 318184.433234
7 210000 233157.319040 216781.215069 217716.224961 241610.0 220518.0 239420.0 238949.641375 278653 235674.070413 ... 219915.755213 233711.491253 229910.815319 234574.214082 227747.908632 228158.422155 237764.978327 228706.248391 233036.430218 231509.540687
16 305000 328228.085708 369653.455025 407632.867296 295400.0 355100.0 332400.0 337598.829007 304000 320282.571248 ... 311465.341662 304974.372297 317777.556983 316071.619329 316160.991859 321357.128338 328128.100319 318615.405361 319158.969796 315921.385775
18 156702 155489.347867 161004.615966 142605.260977 149145.4 178373.1 159753.3 226060.733897 136714 151140.367134 ... 148443.098996 148778.168608 150359.889808 152231.773264 151128.942345 152921.014066 146928.545209 149961.828490 153608.557066 149783.952789

5 rows × 31 columns


In [89]:
def get_winner(row):
    pred_columns = [ c for c in test.columns.values if 'pred' in c]
    min_diff = 1e10
    p_col = pred_columns[0]
    
    for p_col in pred_columns:
        
        diff = abs(row['price'] - row[p_col])
        
        if diff < min_diff:
            winner = p_col
            min_diff = diff
            
    return winner


test.loc[ : , 'winner'] = test.apply(get_winner, axis=1)
test.groupby('winner').count()['market'].reset_index().sort('market', ascending=False)


Out[89]:
winner market
1 pred_price_dec_tree 1435
7 pred_price_log_dec_tree 1102
27 pred_price_mean_district 664
3 pred_price_extra_tree 548
26 pred_price_mean 447
8 pred_price_log_extra_tree 404
28 pred_price_random_forest 397
10 pred_price_log_extra_tree_tun_10_4 396
9 pred_price_log_extra_tree_tun_10_2 395
25 pred_price_log_random_forest 388
5 pred_price_log 387
11 pred_price_log_extra_tree_tun_10_6 387
0 pred_price_bagging 372
4 pred_price_gradient_boos 366
6 pred_price_log_bagging 354
24 pred_price_log_gradient_boos 295
14 pred_price_log_extra_tree_tun_30_6 222
13 pred_price_log_extra_tree_tun_30_4 197
12 pred_price_log_extra_tree_tun_30_2 190
16 pred_price_log_extra_tree_tun_50_4 158
15 pred_price_log_extra_tree_tun_50_2 148
17 pred_price_log_extra_tree_tun_50_6 148
2 pred_price_ensemble 142
20 pred_price_log_extra_tree_tun_70_6 124
18 pred_price_log_extra_tree_tun_70_2 120
19 pred_price_log_extra_tree_tun_70_4 118
23 pred_price_log_extra_tree_tun_90_6 113
21 pred_price_log_extra_tree_tun_90_2 109
22 pred_price_log_extra_tree_tun_90_4 101

In [90]:
pred_columns = [ c for c in test.columns.values if 'pred' in c and c not in ['pred_price_mean', 'pred_price_gradient_boos'] ]
run_ensemble(test, pred_columns)


Out[90]:
{'mae': 61866.967607731975,
 'mape': 10.438472272634575,
 'r2': 0.86542902865466909}

Understanding area with the biggest errors


In [91]:
for pred_column in [ c for c in test.columns.values if 'pred' in c]:
    error_column_name = 'error_{0}'.format(pred_column[len('pred_price_'):])
    test.loc[ : , error_column_name] = test['price'] - test[pred_column]

In [92]:
test_thin = test[ [ c for c in test.columns.values if 'error_' in c] + ['price', 'area', 'district_map'] ]
test_melt = pd.melt( test_thin, id_vars=['price','area', 'district_map'] )
test_melt.head()


Out[92]:
price area district_map variable value
0 240000 28.00 0 error_mean 9993.405763
1 345000 34.00 3 error_mean 65706.278426
2 210000 26.39 7 error_mean -6781.215069
3 305000 45.00 12 error_mean -64653.455025
4 156702 19.60 9 error_mean -4302.615966

In [ ]:
test_melt['value_binary'] = test_melt['value'].map(lambda x: 1 if x >= 0 else 0 )

gsize = theme_matplotlib(rc={"figure.figsize": "20, 40"}, matplotlib_defaults=False)
ggplot(aes(x='price', y='area', colour='value_binary'), test_melt.reset_index() ) \
    + geom_point() \
    + facet_wrap("variable", scales = "free_x") \
    + theme_bw()

In [96]:
gsize = theme_matplotlib(rc={"figure.figsize": "20, 40"}, matplotlib_defaults=False)

ggplot(aes(x='price', y='value', color='district_map'), test_melt) \
    + geom_point() \
    + facet_wrap("variable", scales = "free_x", ncol=3) \
    + geom_hline(color='red', linetype='dotted', yintercept=[-500000, 500000]) \
    + geom_hline(color='red', linetype='solid', yintercept=[-800000, 800000]) \
    + gsize


Out[96]:
<ggplot: (312545841)>

Try to build model per district


In [130]:
df_X, df_y = get_X_y(data)
df_train_X, df_test_X, df_train_y, df_test_y = train_test_split(df_X, df_y) 

models_by_district = {}
## learning
for model_name, model in models():
    print model_name
    model.fit(df_train_X, df_train_y)
    models_by_district[model_name] = model


extra_tree
random_forest
bagging
gradient_boos
dec_tree

In [142]:
## predicting
predicted_values_by_model = [ model.predict(df_test_X) for model in models_by_district.values() ]

pred_y = [ np.mean(values) for values in zip(*predicted_values_by_model) ]
print quality_func(df_test_y, pred_y)


{'mae': 66549.465702032423, 'r2': 0.85366826522927175, 'mape': 11.536420291455025}

Learning curve


In [111]:
# Modified from http://scikit-learn.org/stable/auto_examples/plot_learning_curve.html
from sklearn.learning_curve import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    
    plt.figure()
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=5, n_jobs=-1, train_sizes=train_sizes, scoring='r2')
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.xlabel("Training examples")
    plt.ylabel("Recall")
    plt.legend(loc="best")
    plt.grid("on") 
    if ylim:
        plt.ylim(ylim)
    plt.title(title)
    
def plot(df, est, title):
    X, y = get_X_y(df)
    plot_learning_curve(est, title, X, y, ylim=(0.0, 1.01),
        train_sizes=np.linspace(.05, 0.2, 5))

In [112]:
plot(data, RandomForestRegressor(n_estimators=10, max_depth=10), 'Random forest 10,10')



In [ ]: