Analysis of Airbnb data

  • The data comes from InsideAirbnb; Paris makes a good example city.
  • The aim is to produce a few models of this phenomenon.

In [86]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools
import scipy as sp
import pymc3 as pm3
from scipy import stats
from IPython.core.pylabtools import figsize
import os
figsize(12, 12)
sns.set_style('darkgrid')

In [87]:
DATA_DIR = os.path.join(os.getcwd(), 'data/')

def p2f(x):
    """Convert a percentage string such as '96%' to a float (0.96)."""
    return float(x.strip('%')) / 100

In [88]:
data_file = DATA_DIR + 'reviews.csv'
df = pd.read_csv(data_file, sep=',', parse_dates=['date'])
df['listing_id'] = df['listing_id'].astype('category')
df.groupby(['listing_id', 'date']).count()


Out[88]:
(output truncated) A frame indexed by (listing_id, date) pairs, running from listing 2525 with reviews starting 2009-06-30 through listing 8113735 with reviews up to 2015-09-03.

50501639 rows × 0 columns

This reviews.csv carries only the listing_id and date columns, so nothing is left to count once both are used as group keys, hence the 0-column result; the enormous row count appears to be a side effect of grouping on a categorical key, which expands over every category level.
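As an aside, a tidier way to get review counts, which avoids that blow-up, is to group on the single key and count the dates. A sketch, not what the notebook ran:

df.groupby('listing_id')['date'].count().head()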


In [89]:
data_file = DATA_DIR + 'listings.csv'
df_listing = pd.read_csv(data_file, sep=',', na_values='N/A')
df_listing.tail()
df_listing.columns.tolist()


Out[89]:
['id',
 'listing_url',
 'scrape_id',
 'last_scraped',
 'name',
 'summary',
 'space',
 'description',
 'experiences_offered',
 'neighborhood_overview',
 'notes',
 'transit',
 'thumbnail_url',
 'medium_url',
 'picture_url',
 'xl_picture_url',
 'host_id',
 'host_url',
 'host_name',
 'host_since',
 'host_location',
 'host_about',
 'host_response_time',
 'host_response_rate',
 'host_acceptance_rate',
 'host_is_superhost',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_listings_count',
 'host_total_listings_count',
 'host_verifications',
 'host_has_profile_pic',
 'host_identity_verified',
 'street',
 'neighbourhood',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'zipcode',
 'market',
 'smart_location',
 'country_code',
 'country',
 'latitude',
 'longitude',
 'is_location_exact',
 'property_type',
 'room_type',
 'accommodates',
 'bathrooms',
 'bedrooms',
 'beds',
 'bed_type',
 'amenities',
 'square_feet',
 'price',
 'weekly_price',
 'monthly_price',
 'security_deposit',
 'cleaning_fee',
 'guests_included',
 'extra_people',
 'minimum_nights',
 'maximum_nights',
 'calendar_updated',
 'has_availability',
 'availability_30',
 'availability_60',
 'availability_90',
 'availability_365',
 'calendar_last_scraped',
 'number_of_reviews',
 'first_review',
 'last_review',
 'review_scores_rating',
 'review_scores_accuracy',
 'review_scores_cleanliness',
 'review_scores_checkin',
 'review_scores_communication',
 'review_scores_location',
 'review_scores_value',
 'requires_license',
 'license',
 'jurisdiction_names',
 'instant_bookable',
 'cancellation_policy',
 'require_guest_profile_picture',
 'require_guest_phone_verification',
 'calculated_host_listings_count',
 'reviews_per_month']

In [171]:
df_to_model = df_listing[['reviews_per_month', 'cancellation_policy', 'number_of_reviews',
                          'host_response_rate', 'instant_bookable',
                          'review_scores_communication']].copy()  # .copy() so the later assignments do not trip SettingWithCopyWarning

In [172]:
df_to_model


Out[172]:
reviews_per_month cancellation_policy number_of_reviews host_response_rate instant_bookable review_scores_communication
0 8.00 moderate 52 100% t 10
1 NaN strict 0 100% f NaN
2 3.51 strict 28 96% t 9
3 NaN flexible 0 NaN f NaN
4 NaN flexible 0 100% f NaN
5 2.93 moderate 31 85% f 10
6 NaN flexible 0 NaN f NaN
7 NaN flexible 0 NaN f NaN
8 0.17 strict 2 NaN f 10
9 0.18 moderate 2 NaN f 10
10 1.00 strict 1 91% f 10
11 1.11 flexible 4 100% f 10
12 1.04 strict 34 100% f 10
13 2.35 strict 14 57% f 10
14 NaN flexible 0 NaN f NaN
15 NaN strict 0 60% f NaN
16 0.99 strict 28 80% f 10
17 NaN flexible 0 60% f NaN
18 NaN moderate 0 89% f NaN
19 0.24 flexible 2 100% f 10
20 NaN flexible 0 100% f NaN
21 3.33 moderate 5 88% f 10
22 0.86 strict 4 100% f 10
23 NaN flexible 0 NaN f NaN
24 NaN strict 0 96% f NaN
25 0.18 flexible 2 100% f 8
26 0.60 moderate 1 100% f 10
27 NaN flexible 0 NaN f NaN
28 NaN flexible 0 NaN f NaN
29 3.84 strict 76 100% f 10
... ... ... ... ... ... ...
35398 2.31 flexible 7 100% f 9
35399 NaN flexible 0 100% f NaN
35400 1.00 flexible 1 100% f 10
35401 0.34 moderate 1 NaN f 10
35402 9.18 strict 15 100% f 9
35403 NaN flexible 0 NaN f NaN
35404 NaN flexible 0 33% f NaN
35405 NaN moderate 0 100% t NaN
35406 NaN flexible 0 100% f NaN
35407 NaN flexible 0 83% f NaN
35408 0.19 flexible 2 100% t 10
35409 NaN flexible 0 NaN f NaN
35410 NaN moderate 0 NaN f NaN
35411 0.25 moderate 6 100% f 10
35412 1.19 strict 17 88% f 10
35413 0.90 strict 3 89% f 8
35414 1.00 strict 1 100% f 10
35415 NaN flexible 0 100% f NaN
35416 0.23 strict 3 72% f 9
35417 NaN flexible 0 100% f NaN
35418 1.00 flexible 1 100% f 10
35419 1.00 flexible 1 100% f 10
35420 1.43 strict 5 60% f 10
35421 NaN flexible 0 NaN f NaN
35422 NaN flexible 0 100% f NaN
35423 1.30 flexible 4 100% f 10
35424 1.00 strict 1 72% f NaN
35425 NaN flexible 0 100% f NaN
35426 NaN flexible 0 100% f NaN
35427 NaN strict 0 93% f NaN

35428 rows × 6 columns


In [ ]:
# fill missing values with 0 and fix the dtypes
df_to_model['reviews_per_month'] = df_to_model['reviews_per_month'].fillna(0)
df_to_model['host_response_rate'] = df_to_model['host_response_rate'].fillna(0)
df_to_model['review_scores_communication'] = df_to_model['review_scores_communication'].fillna(0)
df_to_model['instant_bookable'] = df_to_model['instant_bookable'].astype('category')
df_to_model['reviews_per_month'] = df_to_model['reviews_per_month'].astype('int32')  # truncate to whole reviews



In [ ]:
df_to_model['cancellation_policy'] = df_to_model['cancellation_policy'].astype('category')

In [ ]:
df_to_model['host_response_rate'] = df_to_model['host_response_rate'].astype('str')  # the filled zeros become the string '0'
df_to_model['host_response_rate'] = df_to_model['host_response_rate'].apply(p2f)     # '96%' -> 0.96; '0' -> 0.0

In [ ]:
y = df_listing['availability_365'].values  # target: days the listing is available over the next year
df_to_model

In [96]:
df_to_model = pd.get_dummies(df_to_model)  # one-hot encode the categorical columns
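To make the encoding concrete, here is what get_dummies does to a toy frame with the same categorical column (illustration only, not the real data):

toy = pd.DataFrame({'cancellation_policy': ['moderate', 'strict', 'flexible']})
pd.get_dummies(toy)
#    cancellation_policy_flexible  cancellation_policy_moderate  cancellation_policy_strict
# 0                             0                             1                           0
# 1                             0                             0                           1
# 2                             1                             0                           0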

In [109]:
X = df_to_model.values
  • Let us look at one of the columns: X[:, 1] is number_of_reviews.
  • The values below span a very wide range, so we standardise this column as a preprocessing step.

In [119]:
X[:,1]


Out[119]:
array([ 52.,   0.,  28., ...,   0.,   0.,   0.])

In [120]:
from sklearn import preprocessing
X[:, 1] = preprocessing.scale(X[:, 1])  # standardise number_of_reviews to zero mean and unit variance
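preprocessing.scale is just a z-score: subtract the mean and divide by the standard deviation. A minimal check by hand:

col = np.array([52., 0., 28.])
(col - col.mean()) / col.std()  # same values preprocessing.scale(col) returns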

In [121]:
X


Out[121]:
array([[ 8.        ,  1.78244273,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.52226535,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 3.        ,  0.71873131,  0.96      , ...,  1.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        , -0.52226535,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.52226535,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.52226535,  0.93      , ...,  1.        ,
         0.        ,  0.        ]])

In [151]:
from sklearn.cross_validation import train_test_split  # lives in sklearn.model_selection in newer scikit-learn
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)  # 80/20 split

In [152]:
names = df_to_model.columns.tolist()

In [153]:
X_train


Out[153]:
array([[ 0.        , -0.47794405,  0.3       , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.43362274,  1.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.        , -0.52226535,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.        , -0.47794405,  0.1       , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        , -0.43362274,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 3.        ,  0.36416083,  0.97      , ...,  0.        ,
         0.        ,  0.        ]])

In [154]:
features = df_to_model.columns
features = np.asarray(features)


In [155]:
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score

figsize(6, 6)
alpha = 0.1
lasso = Lasso(alpha=alpha)

y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)
print(lasso)
print("r^2 on test data : %f" % r2_score_lasso)

###############################################################################
# ElasticNet
from sklearn.linear_model import ElasticNet

enet = ElasticNet(alpha=alpha, l1_ratio=0.7)

y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)

plt.plot(enet.coef_, label='Elastic net coefficients')
plt.plot(lasso.coef_, label='Lasso coefficients')
plt.legend(loc='best')
plt.title("Lasso R^2: %f, Elastic Net R^2: %f"
          % (r2_score_lasso, r2_score_enet))
plt.show()


Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
r^2 on test data : 0.036396
ElasticNet(alpha=0.1, copy_X=True, fit_intercept=True, l1_ratio=0.7,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)
r^2 on test data : 0.036488
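Both linear models explain almost none of the variance in availability (R² ≈ 0.036). One thing worth trying before abandoning them is choosing alpha by cross-validation rather than fixing it at 0.1; a minimal sketch with LassoCV:

from sklearn.linear_model import LassoCV

lasso_cv = LassoCV(cv=5).fit(X_train, y_train)
print("alpha chosen by CV:", lasso_cv.alpha_)
print("r^2 on test data : %f" % r2_score(y_test, lasso_cv.predict(X_test)))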

In [170]:
from sklearn import ensemble
from sklearn.metrics import mean_squared_error  # needed for the MSE below

###############################################################################
# Fit regression model
params = {'n_estimators': 1000, 'max_depth': 20, 'min_samples_split': 17, 'max_features': 1.0,
          'learning_rate': 0.01, 'loss': 'huber'}
clf = ensemble.GradientBoostingRegressor(**params)

clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)

###############################################################################
# Plot training deviance

# compute test set deviance
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)

for i, y_pred in enumerate(clf.staged_predict(X_test)):
    test_score[i] = clf.loss_(y_test, y_pred)  # clf.loss_ is the fitted huber loss object

plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')


MSE: 121.6835
Out[170]:
<matplotlib.text.Text at 0x413b66908>
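A quick follow-up (not part of the original run): the iteration where the test-set deviance bottoms out is where early stopping would cut the model.

best_iter = np.argmin(test_score) + 1
print("test deviance is minimised at boosting iteration %d" % best_iter)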

In [157]:
###############################################################################
# Plot feature importance
feature_importance = clf.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(8, 6))  # give this plot its own figure; the 1x2 grid belonged to the previous cell
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.asarray(names)[sorted_idx])  # keep labels aligned with the sorted importances
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()



In [102]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)


MSE: 109.7886
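Since the target is days of availability, MSE is in squared days; the root is easier to interpret (roughly 10.5 days here):

print("RMSE: %.4f days" % np.sqrt(mse))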

In [161]:
from sklearn.ensemble.partial_dependence import plot_partial_dependence  # sklearn.inspection in newer releases

# partial dependence: the marginal effect of each feature on predicted
# availability, averaging over the other features
features = [0, 1, 2, 3, 4, 5]
fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
                                   n_jobs=3, grid_resolution=50)
fig.suptitle('Partial dependence of availability on listing features\n'
             'for the Airbnb Paris dataset')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle


