In [86]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools
import scipy as sp
import pymc3 as pm3
from scipy import stats
from IPython.core.pylabtools import figsize
import os
figsize(12, 12)
sns.set_style('darkgrid')
In [87]:
DATA_DIR = os.path.join(os.getcwd(), 'data/')
def p2f(x):
    # Convert a percentage string such as '95%' to a float in [0, 1].
    return float(x.strip('%')) / 100
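A quick sanity check of the helper (illustrative cell, not part of the original run): p2f strips the trailing percent sign and rescales to the unit interval.
In [ ]:
p2f('85%'), p2f('100%')   # -> (0.85, 1.0)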
In [88]:
data_file = DATA_DIR + 'reviews.csv'
df = pd.read_csv(data_file, sep=',', parse_dates=['date'])
df['listing_id'] = df['listing_id'].astype('category')
df.groupby(['listing_id', 'date']).count()
Out[88]:
In [89]:
data_file = DATA_DIR + 'listings.csv'
df_listing = pd.read_csv(data_file, sep=',', na_values='N/A')
df_listing.tail()
df_listing.columns.tolist()
Out[89]:
In [171]:
df_to_model = df_listing[['reviews_per_month', 'cancellation_policy', 'number_of_reviews', 'host_response_rate', 'instant_bookable', 'review_scores_communication']].copy()  # .copy() avoids SettingWithCopyWarning on the assignments below
In [172]:
df_to_model
Out[172]:
In [ ]:
df_to_model['reviews_per_month'] = df_to_model['reviews_per_month'].fillna(0)
df_to_model['host_response_rate'] = df_to_model['host_response_rate'].fillna(0)
df_to_model['review_scores_communication'] = df_to_model['review_scores_communication'].fillna(0)
df_to_model['instant_bookable'] = df_to_model['instant_bookable'].astype('category')
# Note: the int32 cast truncates fractional review rates (e.g. 1.7 -> 1).
df_to_model['reviews_per_month'] = df_to_model['reviews_per_month'].astype('int32')
In [ ]:
df_to_model['cancellation_policy'] = df_to_model['cancellation_policy'].astype('category')
In [ ]:
# fillna(0) above left mixed int/str values; normalize to strings so p2f can parse them.
df_to_model['host_response_rate'] = df_to_model['host_response_rate'].astype('str')
df_to_model['host_response_rate'] = df_to_model['host_response_rate'].apply(p2f)
In [ ]:
df_to_model
In [ ]:
y = df_listing['availability_365'].values
df_to_model
In [96]:
df_to_model = pd.get_dummies(df_to_model)
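For reference, get_dummies expands the two categorical columns (cancellation_policy, instant_bookable) into 0/1 indicator columns. A minimal illustration on made-up rows (not part of the original run):
In [ ]:
pd.get_dummies(pd.DataFrame({'cancellation_policy': ['flexible', 'strict', 'moderate']}))
# -> indicator columns cancellation_policy_flexible / _moderate / _strict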
In [109]:
X = df_to_model.values
In [119]:
X[:,1]
Out[119]:
In [120]:
from sklearn import preprocessing
# Standardize the number_of_reviews column to zero mean and unit variance.
X[:,1] = preprocessing.scale(X[:,1])
In [121]:
X
Out[121]:
In [151]:
from sklearn.model_selection import train_test_split  # sklearn.cross_validation is deprecated
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [152]:
names = df_to_model.columns.tolist()
In [153]:
X_train
Out[153]:
In [154]:
features = df_to_model.columns
features = np.asarray(features)
In [155]:
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
figsize(6,6)
alpha = 0.1
lasso = Lasso(alpha=alpha)
y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)
print(lasso)
print("r^2 on test data : %f" % r2_score_lasso)
###############################################################################
# ElasticNet
from sklearn.linear_model import ElasticNet
enet = ElasticNet(alpha=alpha, l1_ratio=0.7)
y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)
plt.plot(enet.coef_, label='Elastic net coefficients')
plt.plot(lasso.coef_, label='Lasso coefficients')
plt.legend(loc='best')
plt.title("Lasso R^2: %f, Elastic Net R^2: %f"
% (r2_score_lasso, r2_score_enet))
plt.show()
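The alpha above is fixed at 0.1. If you would rather have it chosen by cross-validation, LassoCV searches a path of candidate alphas; a minimal sketch (new cell, not part of the original run):
In [ ]:
from sklearn.linear_model import LassoCV
# Fit a Lasso path and pick alpha by 5-fold cross-validation on the training set.
lasso_cv = LassoCV(cv=5, random_state=42).fit(X_train, y_train)
print("alpha chosen by CV: %f" % lasso_cv.alpha_)
print("r^2 on test data : %f" % r2_score(y_test, lasso_cv.predict(X_test)))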
In [170]:
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
###############################################################################
# Fit regression model
params = {'n_estimators': 1000, 'max_depth': 20, 'min_samples_split': 17,
          'max_features': 1.0, 'learning_rate': 0.01, 'loss': 'huber'}
clf = ensemble.GradientBoostingRegressor(**params)
clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)
###############################################################################
# Plot training deviance
# compute test set deviance
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
# staged_predict yields predictions after each boosting iteration,
# so we can trace test-set loss as the ensemble grows.
for i, y_pred in enumerate(clf.staged_predict(X_test)):
    test_score[i] = clf.loss_(y_test, y_pred)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
Out[170]:
In [157]:
###############################################################################
# Plot feature importance
feature_importance = clf.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.asarray(names)[sorted_idx])  # sort labels with the bars, or the plot is mislabeled
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
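Impurity-based importances can overstate high-cardinality features, so a model-agnostic cross-check is to permute one column at a time and watch the drop in test r^2. A minimal sketch (illustrative cell; sklearn's built-in permutation_importance only arrived in later releases):
In [ ]:
rng = np.random.RandomState(42)
base_r2 = r2_score(y_test, clf.predict(X_test))
for j, name in enumerate(names):
    X_perm = X_test.copy()
    rng.shuffle(X_perm[:, j])  # break this column's relationship to y
    print("%-40s %+.4f" % (name, base_r2 - r2_score(y_test, clf.predict(X_perm))))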
In [102]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)
In [161]:
from sklearn.ensemble.partial_dependence import plot_partial_dependence
features = [0, 1, 2, 3, 4, 5]  # column indices to plot (overwrites the earlier feature-name array)
fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
                                   n_jobs=3, grid_resolution=50)
fig.suptitle('Partial dependence of availability_365 on the model features\n'
             'for the Airbnb Paris dataset')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle