In [86]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import itertools
import scipy as sp
import pymc3 as pm3
from scipy import stats
from IPython.core.pylabtools import figsize
import os
figsize(12, 12)
sns.set_style('darkgrid')
In [87]:
DATA_DIR = os.path.join(os.getcwd(), 'data/')
def p2f(x):
    # Convert a percentage string such as '95%' to a float in [0, 1].
    return float(x.strip('%')) / 100
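A quick sanity check of the helper (illustrative cell, not part of the original run): p2f strips the trailing percent sign and rescales to the unit interval.
In [ ]:
p2f('85%'), p2f('100%')   # -> (0.85, 1.0)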
In [88]:
data_file = DATA_DIR + 'reviews.csv'
df = pd.read_csv(data_file, sep=',', parse_dates=['date'])
df['listing_id'] = df['listing_id'].astype('category')
df.groupby(['listing_id', 'date']).count()
Out[88]:
In [89]:
data_file = DATA_DIR + 'listings.csv'
df_listing = pd.read_csv(data_file, sep=',', na_values='N/A')
df_listing.tail()
df_listing.columns.tolist()
Out[89]:
In [171]:
df_to_model = df_listing[['reviews_per_month', 'cancellation_policy', 'number_of_reviews', 'host_response_rate', 'instant_bookable', 'review_scores_communication']].copy()  # .copy() avoids SettingWithCopyWarning on the assignments below
In [172]:
df_to_model
Out[172]:
In [ ]:
df_to_model['reviews_per_month'] = df_to_model['reviews_per_month'].fillna(0)
df_to_model['host_response_rate'] = df_to_model['host_response_rate'].fillna(0)
df_to_model['review_scores_communication'] = df_to_model['review_scores_communication'].fillna(0)
df_to_model['instant_bookable'] = df_to_model['instant_bookable'].astype('category')
# Note: the int32 cast truncates fractional review rates (e.g. 1.7 -> 1).
df_to_model['reviews_per_month'] = df_to_model['reviews_per_month'].astype('int32')
In [ ]:
df_to_model['cancellation_policy'] = df_to_model['cancellation_policy'].astype('category')
In [ ]:
# fillna(0) above left mixed int/str values; normalize to strings so p2f can parse them.
df_to_model['host_response_rate'] = df_to_model['host_response_rate'].astype('str')
df_to_model['host_response_rate'] = df_to_model['host_response_rate'].apply(p2f)
In [ ]:
df_to_model
In [ ]:
y = df_listing['availability_365'].values
df_to_model
In [96]:
df_to_model = pd.get_dummies(df_to_model)
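For reference, get_dummies expands the two categorical columns (cancellation_policy, instant_bookable) into 0/1 indicator columns. A minimal illustration on made-up rows (not part of the original run):
In [ ]:
pd.get_dummies(pd.DataFrame({'cancellation_policy': ['flexible', 'strict', 'moderate']}))
# -> indicator columns cancellation_policy_flexible / _moderate / _strict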
In [109]:
X = df_to_model.values
In [119]:
X[:,1]
Out[119]:
In [120]:
from sklearn import preprocessing
# Standardize the number_of_reviews column to zero mean and unit variance.
X[:,1] = preprocessing.scale(X[:,1])
In [121]:
X
Out[121]:
In [151]:
from sklearn.model_selection import train_test_split  # sklearn.cross_validation is deprecated
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [152]:
names = df_to_model.columns.tolist()
In [153]:
X_train
Out[153]:
In [154]:
features = df_to_model.columns
features = np.asarray(features)
In [155]:
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
figsize(6,6)
alpha = 0.1
lasso = Lasso(alpha=alpha)
y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)
print(lasso)
print("r^2 on test data : %f" % r2_score_lasso)
###############################################################################
# ElasticNet
from sklearn.linear_model import ElasticNet
enet = ElasticNet(alpha=alpha, l1_ratio=0.7)
y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)
plt.plot(enet.coef_, label='Elastic net coefficients')
plt.plot(lasso.coef_, label='Lasso coefficients')
plt.legend(loc='best')
plt.title("Lasso R^2: %f, Elastic Net R^2: %f"
% (r2_score_lasso, r2_score_enet))
plt.show()
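The alpha above is fixed at 0.1. If you would rather have it chosen by cross-validation, LassoCV searches a path of candidate alphas; a minimal sketch (new cell, not part of the original run):
In [ ]:
from sklearn.linear_model import LassoCV
# Fit a Lasso path and pick alpha by 5-fold cross-validation on the training set.
lasso_cv = LassoCV(cv=5, random_state=42).fit(X_train, y_train)
print("alpha chosen by CV: %f" % lasso_cv.alpha_)
print("r^2 on test data : %f" % r2_score(y_test, lasso_cv.predict(X_test)))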
In [170]:
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
###############################################################################
# Fit regression model
params = {'n_estimators': 1000, 'max_depth': 20, 'min_samples_split': 17,
          'max_features': 1.0, 'learning_rate': 0.01, 'loss': 'huber'}
clf = ensemble.GradientBoostingRegressor(**params)
clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)
###############################################################################
# Plot training deviance
# compute test set deviance
test_score = np.zeros((params['n_estimators'],), dtype=np.float64)
# staged_predict yields predictions after each boosting iteration,
# so we can trace test-set loss as the ensemble grows.
for i, y_pred in enumerate(clf.staged_predict(X_test)):
    test_score[i] = clf.loss_(y_test, y_pred)
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
plt.title('Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, clf.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(np.arange(params['n_estimators']) + 1, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
Out[170]:
In [157]:
###############################################################################
# Plot feature importance
feature_importance = clf.feature_importances_
# make importances relative to max importance
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.subplot(1, 2, 2)
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, np.asarray(names)[sorted_idx])  # sort labels with the bars, or the plot is mislabeled
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
plt.show()
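Impurity-based importances can overstate high-cardinality features, so a model-agnostic cross-check is to permute one column at a time and watch the drop in test r^2. A minimal sketch (illustrative cell; sklearn's built-in permutation_importance only arrived in later releases):
In [ ]:
rng = np.random.RandomState(42)
base_r2 = r2_score(y_test, clf.predict(X_test))
for j, name in enumerate(names):
    X_perm = X_test.copy()
    rng.shuffle(X_perm[:, j])  # break this column's relationship to y
    print("%-40s %+.4f" % (name, base_r2 - r2_score(y_test, clf.predict(X_perm))))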
In [102]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)
In [161]:
from sklearn.ensemble.partial_dependence import plot_partial_dependence
features = [0, 1, 2, 3, 4, 5]  # column indices to plot (overwrites the earlier feature-name array)
fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
                                   n_jobs=3, grid_resolution=50)
fig.suptitle('Partial dependence of availability_365 on the model features\n'
             'for the Airbnb Paris dataset')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle