In [1]:
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ggplot import *
import brewer2mpl
import dateutil
from random import shuffle, randint
import json
import matplotlib as mpl
theme_bw()
%matplotlib inline
In [2]:
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.grid_search import GridSearchCV
In [3]:
data = pd.read_csv('../data/run/warszawa/data.csv')
data = data.fillna(-1)
In [4]:
data.info()
In [5]:
data.describe()
Out[5]:
In [6]:
hist_columns = ['area', 'price', 'num_room', 'num_bedroom', 'num_bathroom', 'kitchen']
data[ hist_columns ].hist(figsize=(16,12),bins=30)
plt.show()
In [7]:
def summary(values, percentiles=[1, 5, 95, 99]):
    for percentile in percentiles:
        print '{0}th -> {1}'.format(percentile, np.percentile(values, percentile))
In [8]:
summary(data.area.values)
In [9]:
data['area_log'] = data.area.map(lambda x: np.log2(x))
summary(data.area_log.values)
In [11]:
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)) = plt.subplots(4,2, figsize=(15, 10))
fig.subplots_adjust(hspace=.8)
def sub_plot(data, column, ax1, ax2):
    data.hist(column, bins=20, ax=ax1)
    data.boxplot(column, ax=ax2)
sub_plot(data, 'area', ax1, ax2)
sub_plot(data[ data.area < 172 ], 'area', ax3, ax4)
sub_plot(data, 'area_log', ax5, ax6)
sub_plot(data[ (data.area_log > 4) & (data.area_log < 8)], 'area_log', ax7, ax8)
Without the outliers, area_log looks close to a normal distribution, so we remove them.
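As a quick sanity check on those cutoffs: since area_log is log2 of the area, the bounds 4 and 8 correspond to 2**4 = 16 m^2 and 2**8 = 256 m^2.
In [ ]:
# the log2 bounds expressed back in square meters
print np.exp2(4), np.exp2(8)  # -> 16.0 256.0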
In [12]:
data = data[ (data.area_log > 4) & (data.area_log < 8)]
In [13]:
summary(data.price.values, range(95, 100, 1))
In [14]:
data['price_log'] = data.price.map(lambda x: np.log2(x))
summary(data['price_log'].values)
In [15]:
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6), (ax7, ax8)) = plt.subplots(4,2, figsize=(15, 10))
fig.subplots_adjust(hspace=.8)
sub_plot(data, 'price', ax1, ax2)
sub_plot(data[ data.price < 1700000 ], 'price', ax3, ax4)
sub_plot(data, 'price_log', ax5, ax6)
sub_plot(data[ (data.price_log > 17) & (data.price_log < 21)], 'price_log', ax7, ax8)
Remove the outliers based on price.
In [16]:
data = data[ (data.price_log > 17) & (data.price_log < 21)]
In [17]:
f, ax = plt.subplots(figsize=(20,20))
sns.corrplot(data, sig_stars=False, ax=ax)
Out[17]:
In [18]:
def select_features(data):
    columns = data.loc[ :, (data.dtypes == np.int64) | (data.dtypes == np.float64) | (data.dtypes == np.bool) ].columns.values
    return [feat for feat in columns if 'price' not in feat]

def get_X_y(data, cols=None):
    if not cols:
        cols = select_features(data)
    X = data[cols].values
    y = data['price'].values
    return X, y

def get_importance_features(data, model=RandomForestRegressor(), limit=25):
    X, y = get_X_y(data)
    cols = select_features(data)
    model.fit(X, y)
    feats = pd.DataFrame(model.feature_importances_, index=data[cols].columns)
    feats = feats.sort([0], ascending=False)[:limit]
    return feats.rename(columns={0: 'importance'})

def draw_importance_features(data, model=RandomForestRegressor(), limit=30):
    feats = get_importance_features(data, model, limit)
    feats.plot(kind='bar', figsize=(20, 8))

draw_importance_features(data)
Let's try adding a new feature: price per square meter, plus the log of that value.
In [19]:
data.loc[ : , 'price_m2'] = data.apply(lambda x: x['price'] / float(x['area']), axis=1)
summary(data['price_m2'].values)
In [20]:
data.loc[ : , 'price_m2_log'] = data['price_m2'].map(lambda x: np.log2(x))
summary(data['price_m2_log'].values)
In [21]:
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3,2, figsize=(15, 10))
fig.subplots_adjust(hspace=.8)
sub_plot(data, 'price_m2', ax1, ax2)
sub_plot(data[ data.price_m2 < 16897 ], 'price_m2', ax3, ax4)
sub_plot(data, 'price_m2_log', ax5, ax6)
Let's remove the outliers based on price per m^2.
In [22]:
data = data[ data.price_m2 < 16897 ]
In [23]:
draw_importance_features(data)
In [24]:
# 'floor' is a string like '3/5'; the first part feeds floor_num, the second floor_build
data['floor_num'] = data['floor'].map(lambda x: float(x.split('/')[0]) if isinstance(x, basestring) else -1)
data['floor_build'] = data['floor'].map(lambda x: float(x.split('/')[1]) if isinstance(x, basestring) else -1)
data['floor_ratio'] = (data['floor_build'] + 1) / (data['floor_num'] + 1)
data['floor_ratio'].fillna(-1, inplace=True)
data['last_floor'] = data['floor_num'] == data['floor_build']
data['zero_floor'] = data['floor_num'] == 0
data['one_floor'] = data['floor_num'] == 1
data['two_floor'] = data['floor_num'] == 2
data['three_floor'] = data['floor_num'] == 3
data['four_floor'] = data['floor_num'] == 4
data['five_floor'] = data['floor_num'] == 5
draw_importance_features(data)
In [25]:
floor_cols = [c for c in data.columns.values if 'floor' in c]
data[ floor_cols ].hist(figsize=(16,12),bins=20)
plt.show()
In [26]:
def year_decade(year):
    if year < 1901:
        return 1900
    elif year < 1931:
        return 1930
    elif year < 1941:
        return 1940
    elif year < 1951:
        return 1950
    elif year < 1961:
        return 1960
    elif year < 1971:
        return 1970
    elif year < 1981:
        return 1980
    elif year < 1991:
        return 1990
    elif year < 2001:
        return 2000
    elif year < 2006:
        return 2005
    elif year < 2011:
        return 2010
    elif year < 2012:
        return 2012
    elif year < 2015:
        return 2014
    else:
        return year
data['year_decade'] = data['year'].map(year_decade)
data['year_decade'].hist(bins=20)
draw_importance_features(data)
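The if/elif ladder works; as a sketch of a more compact alternative using the same breakpoints, pd.cut can express the binning declaratively (note that, unlike year_decade, years above 2014 would come out as NaN here; the 'year_decade_cut' column is illustrative only).
In [ ]:
# same bins as year_decade, via pd.cut (right-inclusive edges)
bins = [-np.inf, 1900, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2005, 2010, 2011, 2014]
labels = [1900, 1930, 1940, 1950, 1960, 1970, 1980, 1990, 2000, 2005, 2010, 2012, 2014]
data['year_decade_cut'] = pd.cut(data['year'], bins=bins, labels=labels)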
In [27]:
data['how_old'] = data.year.map(lambda x: -1 if x == -1 else 2015 - x)
summary(data['how_old'].values)
In [28]:
data['how_old_log'] = data['how_old'].map(lambda x: -1 if x < 1 else np.log2(x))
summary(data['how_old_log'].values)
In [29]:
fig, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3,2, figsize=(15, 10))
fig.subplots_adjust(hspace=.8)
sub_plot(data, 'how_old', ax1, ax2)
sub_plot(data[ data.how_old < 103 ], 'how_old', ax3, ax4)
sub_plot(data[ False == data.how_old.isnull() ], 'how_old_log', ax5, ax6)
In [30]:
draw_importance_features(data)
In [31]:
districts = None
with open('../data/info/warszawa/district.json', 'r') as f:
    districts = json.loads(f.read())

subdistrict_to_district = {}
for key in districts.keys():
    for value in districts[key]:
        subdistrict_to_district[value] = key

def norm_district(district):
    if not isinstance(district, basestring):
        return district
    district = district.decode("utf8")
    if district in districts:
        return district
    elif district in subdistrict_to_district:
        return subdistrict_to_district[district]
    else:
        return 'unknown'
data['district_norm'] = data['district'].map(norm_district)
district = data.groupby('district_norm').count()['market'].reset_index()
district.columns = ['district', 'count']
district.plot(kind='bar')
district
Out[31]:
In [32]:
data['district_map'], district_indexes = pd.factorize(data.district_norm)
print list(enumerate(district_indexes))
draw_importance_features(data)
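pd.factorize assigns integer codes in order of first appearance, which the tree-based models can split on; a toy illustration (made-up values, not from the dataset):
In [ ]:
codes, uniques = pd.factorize(pd.Series(['Wola', 'Mokotów', 'Wola', 'Ursynów']))
print codes          # -> [0 1 0 2]
print list(uniques)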
In [33]:
data['street_map'], street_indexes = pd.factorize(data.street)
draw_importance_features(data)
In [34]:
data['offer_date'] = pd.to_datetime(data['offer_date'])
data['offer_date_year'] = data['offer_date'].map(lambda x: int(x.year) )
data['offer_date_month'] = data['offer_date'].map(lambda x: int(x.month) )
data['offer_date_day'] = data['offer_date'].map(lambda x: int(x.day) )
data['offer_date_dayofweek'] = data['offer_date'].map(lambda x: int(x.dayofweek) )
data['offer_date_weekofyear'] = data['offer_date'].map(lambda x: int(x.weekofyear) )
In [35]:
draw_importance_features(data)
In [36]:
def map_nature(value):
    if value == -1:
        return value
    # 'nature' holds comma-separated category indices; encode each subset as a bitmask
    return sum([ 2**int(p) for p in value.split(',') ])
data['nature_map'] = data['nature'].map(map_nature)
draw_importance_features(data)
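So map_nature packs the comma-separated category indices into a bitmask: every subset of categories maps to a distinct integer, e.g. '0,2' -> 2**0 + 2**2 = 5.
In [ ]:
print map_nature('0,2'), map_nature('1'), map_nature(-1)  # -> 5 2 -1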
In [37]:
data['type_roof_map'], indexes_roof = pd.factorize( data['type_roof'] )
draw_importance_features(data, limit=40)
In [38]:
def map_availability(value):
    if value == -1:
        return value
    if value == 'natychmiast':      # Polish: 'immediately'
        return -2
    if value == 'do uzgodnienia':   # Polish: 'to be agreed'
        return -3
    values = value.split(' ')
    if len(values) == 2:
        month = values[0]
        year = int(values[1])
        pl_months = {  # Polish month names -> month numbers
            'styczeń': 1,
            'luty': 2,
            'marzec': 3,
            'kwiecień': 4,
            'maj': 5,
            'czerwiec': 6,
            'lipiec': 7,
            'sierpień': 8,
            'wrzesień': 9,
            'październik': 10,
            'listopad': 11,
            'grudzień': 12,
        }
        return datetime.datetime(year, pl_months[month], 1)
    return -4
data['availability_map'] = data['availability'].map(map_availability)
In [39]:
def availability_diff(x):
    if isinstance(x['availability_map'], int):
        return x['availability_map']
    try:
        return ( x['offer_date'] - x['availability_map'] ).days
    except:
        return 0
data['availability_diff'] = data.apply(availability_diff, axis=1)
draw_importance_features(data, limit=40)
In [40]:
def get_num_features(data):
    columns = data.loc[ :, (data.dtypes == np.float64) | (data.dtypes == np.int64) ].columns.values
    return [c for c in columns if 'price' not in c]
In [41]:
import itertools
num_cols = get_num_features(data)
for feat_x, feat_y in itertools.product(num_cols, num_cols):
    name = '{0}_{1}'.format(feat_x, feat_y)
    data[name] = data[feat_x] * data[feat_y]
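Note that itertools.product generates each symmetric pair twice (feat_x * feat_y equals feat_y * feat_x), so roughly half of these interaction columns are duplicates; a sketch that avoids the redundancy:
In [ ]:
# combinations_with_replacement yields each unordered pair (including squares) exactly once
for feat_x, feat_y in itertools.combinations_with_replacement(num_cols, 2):
    data['{0}_{1}'.format(feat_x, feat_y)] = data[feat_x] * data[feat_y]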
In [42]:
draw_importance_features(data, limit=100)
In [ ]:
print get_importance_features(data)
In [43]:
def get_train_test(data, target_column, cols=None):
    # each row lands in the test set with probability 1/3
    data['is_test'] = data['price'].map(lambda x: randint(1, 3) == 3)
    if cols is None:
        cols = ['area', 'area_log', 'num_room', 'year', 'making']
    train = data[ ~data['is_test'] ]
    test = data[ data['is_test'] ]
    X_train = train[cols].values
    y_train = train[target_column].values
    X_test = test[cols].values
    y_test = test[target_column].values
    return X_train, X_test, y_train, y_test

cols = select_features(data)
X_train, X_test, y_train, y_test = get_train_test(data, 'price', cols)
train = data[ ~data['is_test'] ]
test = data[ data['is_test'] ]
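The randint-based flag gives each row a 1/3 chance of being test data; the train_test_split imported in In [2] would give an equivalent random split on arrays (a sketch; it does not keep the is_test flag on the DataFrame):
In [ ]:
X_tr, X_te, y_tr, y_te = train_test_split(data[cols].values, data['price'].values, test_size=0.33)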
In [44]:
def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
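A quick check of the metric on toy numbers: errors of 10% and 20% should average to 15.
In [ ]:
print mean_absolute_percentage_error([100, 200], [90, 240])  # -> 15.0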
In [45]:
def quality_func(price, pred_price):
    return {
        'r2': r2_score(price, pred_price),
        'mae': mean_absolute_error(price, pred_price),
        'mape': mean_absolute_percentage_error(price, pred_price)
    }
In [47]:
mean_price = train['price_m2'].mean()
test.loc[ : , 'pred_price_mean'] = test['area'].map(lambda area: area * mean_price)
quality_func(test['price'].values, test['pred_price_mean'].values)
Out[47]:
In [48]:
district_prices = data[ ['district_map', 'price_m2'] ].groupby('district_map').agg(np.mean).to_dict()['price_m2']
district_prices
test.loc[ : , 'pred_price_mean_district'] = test.apply(lambda x: district_prices[ x['district_map'] ] * x['area'], axis=1)
quality_func(test['price'].values, test['pred_price_mean_district'].values)
Out[48]:
In [49]:
def models():
    yield ('extra_tree', ExtraTreesRegressor())
    yield ('random_forest', RandomForestRegressor())
    yield ('bagging', BaggingRegressor())
    yield ('gradient_boost', GradientBoostingRegressor())
    yield ('dec_tree', DecisionTreeRegressor())
In [50]:
for name, model in models():
    model.fit(X_train, y_train)
    key_pred_model = 'pred_price_{0}'.format(name)
    test.loc[ : , key_pred_model ] = model.predict(X_test)
    print model
    print quality_func(test['price'].values, test[key_pred_model].values)
In [77]:
def run_log_model(train, test, model=ExtraTreesRegressor(), cols=None, output_column='pred_price_log'):
    if cols is None:
        cols = select_features(data)
    X_train = train[cols].values
    y_train = train['price_log'].values
    X_test = test[cols].values
    y_test = test['price'].values
    model.fit(X_train, y_train)
    y_pred_log = model.predict(X_test)
    # predictions are in log2 space; exp2 converts them back to PLN
    test.loc[ : , output_column ] = np.exp2(y_pred_log)
    return quality_func(y_test, test[output_column].values)

cols = select_features(data)
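Training on price_log and applying np.exp2 to the predictions works because exp2 is the exact inverse of log2 (errors in log space penalize relative rather than absolute deviations).
In [ ]:
print np.exp2(np.log2(350000.0))  # -> 350000.0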
In [79]:
cols = select_features(data)
for model_name, model in models():
    output_column = 'pred_price_log_{0}'.format(model_name)
    print model_name
    print run_log_model(train, test, model, cols, output_column)
In [85]:
for n_estimators in range(10, 101, 20):
    for min_samples_split in [2, 4, 6]:
        output_column = 'pred_price_log_extra_tree_tun_{0}_{1}'.format(n_estimators, min_samples_split)
        model = ExtraTreesRegressor(n_estimators=n_estimators, min_samples_split=min_samples_split, n_jobs=-1)
        print output_column
        print run_log_model(train, test, model, cols, output_column)
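GridSearchCV was imported in In [2] but never used; the same sweep could be written with it (a sketch: the cv and scoring values here are illustrative choices, not from the original):
In [ ]:
param_grid = {'n_estimators': range(10, 101, 20), 'min_samples_split': [2, 4, 6]}
grid = GridSearchCV(ExtraTreesRegressor(n_jobs=-1), param_grid, scoring='r2', cv=3)
grid.fit(train[cols].values, train['price_log'].values)
print grid.best_params_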
In [86]:
pred_columns = [ c for c in test.columns.values if 'pred' in c ]

def run_ensemble(test, columns):
    # average the predictions of all models listed in `columns`
    test.loc[ : , 'pred_price_ensemble'] = test[columns].apply(np.mean, axis=1)
    return quality_func(test['price'].values, test['pred_price_ensemble'].values)

run_ensemble(test, pred_columns)
Out[86]:
In [88]:
test[ ['price', 'pred_price_ensemble'] + pred_columns ].head()
Out[88]:
In [89]:
def get_winner(row):
    pred_columns = [ c for c in test.columns.values if 'pred' in c ]
    min_diff = 1e10
    winner = pred_columns[0]
    for p_col in pred_columns:
        diff = abs(row['price'] - row[p_col])
        if diff < min_diff:
            winner = p_col
            min_diff = diff
    return winner

test.loc[ : , 'winner'] = test.apply(get_winner, axis=1)
test.groupby('winner').count()['market'].reset_index().sort('market', ascending=False)
Out[89]:
In [90]:
pred_columns = [ c for c in test.columns.values if 'pred' in c and c not in ['pred_price_mean', 'pred_price_gradient_boost'] ]
run_ensemble(test, pred_columns)
Out[90]:
In [91]:
for pred_column in [ c for c in test.columns.values if 'pred' in c ]:
    error_column_name = 'error_{0}'.format(pred_column[len('pred_price_'):])
    test.loc[ : , error_column_name] = test['price'] - test[pred_column]
In [92]:
test_thin = test[ [ c for c in test.columns.values if 'error_' in c] + ['price', 'area', 'district_map'] ]
test_melt = pd.melt( test_thin, id_vars=['price','area', 'district_map'] )
test_melt.head()
Out[92]:
In [ ]:
test_melt['value_binary'] = test_melt['value'].map(lambda x: 1 if x >= 0 else 0 )
gsize = theme_matplotlib(rc={"figure.figsize": "20, 40"}, matplotlib_defaults=False)
ggplot(aes(x='price', y='area', colour='value_binary'), test_melt.reset_index() ) \
+ geom_point() \
+ facet_wrap("variable", scales = "free_x") \
+ theme_bw()
In [96]:
gsize = theme_matplotlib(rc={"figure.figsize": "20, 40"}, matplotlib_defaults=False)
ggplot(aes(x='price', y='value', color='district_map'), test_melt) \
+ geom_point() \
+ facet_wrap("variable", scales = "free_x", ncol=3) \
+ geom_hline(color='red', linetype='dotted', yintercept=[-500000, 500000]) \
+ geom_hline(color='red', linetype='solid', yintercept=[-800000, 800000]) \
+ gsize
Out[96]:
In [130]:
df_X, df_y = get_X_y(data)
df_train_X, df_test_X, df_train_y, df_test_y = train_test_split(df_X, df_y)
trained_models = {}
## learning
for model_name, model in models():
    print model_name
    model.fit(df_train_X, df_train_y)
    trained_models[model_name] = model
In [142]:
## predicting
predicted_values_by_model = [ model.predict(df_test_X) for model in trained_models.values() ]
pred_y = [ np.mean(values) for values in zip(*predicted_values_by_model) ]
print quality_func(df_test_y, pred_y)
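An equivalent vectorized form of the averaging (same result, fewer Python loops):
In [ ]:
# stack per-model predictions into shape (n_models, n_samples) and average over models
pred_y = np.vstack(predicted_values_by_model).mean(axis=0)
print quality_func(df_test_y, pred_y)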
In [111]:
# Modified from http://scikit-learn.org/stable/auto_examples/plot_learning_curve.html
from sklearn.learning_curve import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=5,
                        train_sizes=np.linspace(.1, 1.0, 5)):
    plt.figure()
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=-1, train_sizes=train_sizes, scoring='r2')
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.xlabel("Training examples")
    plt.ylabel("R^2 score")
    plt.legend(loc="best")
    plt.grid(True)
    if ylim:
        plt.ylim(ylim)
    plt.title(title)

def plot(df, est, title):
    X, y = get_X_y(df)
    plot_learning_curve(est, title, X, y, ylim=(0.0, 1.01),
                        train_sizes=np.linspace(.05, 0.2, 5))
In [112]:
plot(data, RandomForestRegressor(n_estimators=10, max_depth=10), 'Random forest 10,10')
In [ ]: