In this assignment, your challenge is to do some basic analysis for Airbnb. Two data files are provided in hw/data/: bookings.csv and listings.csv. The objective is to practice data munging and to begin our exploration of regression.
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
pd.set_option('display.max_rows', 20)
pd.set_option('display.precision', 4)
%matplotlib inline
In [2]:
bookings = pd.read_csv('../data/bookings.csv', parse_dates = ['booking_date'])
listings = pd.read_csv('../data/listings.csv')
In [3]:
print "Mean of the variables of interests"
listings[['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']].mean()
Out[3]:
In [4]:
print "Standard deviation of the variables of interest"
listings[['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']].std()
Out[4]:
In [5]:
print "Median of the variables of interest"
listings[['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']].median()
Out[5]:
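For reference, the three summaries above can be computed in a single call with DataFrame.agg; a minimal equivalent sketch:
In [ ]:
# Mean, standard deviation, and median of the variables of interest in one table.
cols = ['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']
listings[cols].agg(['mean', 'std', 'median'])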
In [6]:
print('Mean price, person capacity, picture count, description length, and tenure by property type')
listings.groupby('prop_type')[['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']].mean()
Out[6]:
In [7]:
print('Property type 1: Mean price, person capacity, picture count, description length, and tenure by neighborhood')
listings[listings.prop_type == 'Property type 1'].groupby('neighborhood')[['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']].mean().sort_index()
Out[7]:
In [8]:
print('Property type 2: Mean price, person capacity, picture count, description length, and tenure by neighborhood')
listings[listings.prop_type == 'Property type 2'].groupby('neighborhood')[['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']].mean().sort_index()
Out[8]:
In [9]:
print('Property type 3: Mean price, person capacity, picture count, description length, and tenure by neighborhood')
listings[listings.prop_type == 'Property type 3'].groupby('neighborhood')[['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']].mean()
Out[9]:
In [10]:
# value_counts() sorts by count, so sort_index() is needed to put the dates back in chronological order.
plot1 = bookings.booking_date.value_counts().sort_index().plot(figsize = (25, 10))
plot1.set_title('Daily bookings')
plot1.set_xlabel('Date')
plot1.set_ylabel('Number of bookings')
Out[10]:
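Note that value_counts() only yields rows for dates that actually appear in the data, so days with zero bookings are silently skipped. Resampling on a datetime index fills those gaps; a sketch, assuming bookings as loaded above:
In [ ]:
# size() per calendar day; days with no bookings show up as 0.
daily = bookings.set_index('booking_date').resample('D').size()
ax = daily.plot(figsize = (25, 10))
ax.set_title('Daily bookings (zero-booking days included)')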
In [11]:
lb = listings.merge(bookings, on = 'prop_id')
In [12]:
lb['day_of_week'] = lb.booking_date.dt.dayofweek
In [13]:
def day_word(number):
    day_of_week = {0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday',
                   4: 'Friday', 5: 'Saturday', 6: 'Sunday'}
    return day_of_week[number]
In [14]:
lb['day'] = lb['day_of_week'].map(day_word)
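For reference, recent pandas versions (0.23+) can produce the same weekday labels directly, making both helper cells unnecessary:
In [ ]:
# Equivalent to the day_word mapping above.
lb['day'] = lb['booking_date'].dt.day_name()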
In [15]:
# Warning: Super unpretty graph to follow. :( But I was told not to worry. :)
lb.groupby(['neighborhood', 'booking_date'])['prop_id'].count().unstack(0).plot(figsize = (20, 10))
plt.title('Daily bookings by neighborhood')
plt.xlabel('Date')
plt.ylabel('Number of bookings')
plt.ylim([0, 14]);
In [16]:
# Here's a prettier plot. But it uses day of the week on the x axis.
# I could not figure out how to re-order the x axis labels; one possible fix is sketched after this cell.
lb.groupby(['neighborhood', 'day'])['prop_id'].count().unstack(0).plot(figsize = (20, 10))
plt.title('Daily bookings by neighborhood')
plt.xlabel('Day of the week')
plt.ylabel('Number of bookings')
plt.ylim([0, 35]);
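One way to re-order the x axis labels: reindex the unstacked frame with the weekday names in calendar order before plotting. A minimal sketch, assuming the lb frame built above:
In [ ]:
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
by_day = lb.groupby(['neighborhood', 'day'])['prop_id'].count().unstack(0)
# reindex() reorders the rows, and therefore the x axis, to calendar order.
ax = by_day.reindex(day_order).plot(figsize = (20, 10))
ax.set_xlabel('Day of the week')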
In [17]:
# Not all listings had bookings.
# Build a dict with prop_id as key and number of bookings as value.
bookedProperties = {}
for prop in bookings['prop_id']:
    if prop in bookedProperties:
        bookedProperties[prop] += 1
    else:
        bookedProperties[prop] = 1
In [18]:
# Return the number of bookings if any were made, zero otherwise.
def bookings_map(propertyid):
    return bookedProperties.get(propertyid, 0)
In [19]:
# Apply function.
listings['number_of_bookings'] = listings.prop_id.map(bookings_map)
listings['booking_rate'] = listings.number_of_bookings / listings.tenure_months
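For reference, the counting loop and lookup function above collapse into a single pandas expression, since Series.map accepts a Series and looks values up by its index; a sketch producing the same number_of_bookings column:
In [ ]:
counts = bookings['prop_id'].value_counts()
# Listings with no bookings are absent from counts, hence fillna(0).
listings['number_of_bookings'] = listings['prop_id'].map(counts).fillna(0).astype(int)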
In [20]:
listings_new = listings[listings.tenure_months >= 10].copy()
In [21]:
# Recode categories in prop_type and neighborhood so the dummy-variable column names are valid later
# (i.e., whitespace to underscore).
# Note: assigning new columns to a filtered slice of listings raises a SettingWithCopyWarning;
# taking an explicit .copy() when creating listings_new (cell [20] above) avoids it.
listings_new['prop_typeR'] = listings_new['prop_type'].str.replace(' ', '_', regex=False)
listings_new['neighborhoodR'] = listings_new['neighborhood'].str.replace(' ', '_', regex=False)
prop_type and neighborhood are categorical variables; use get_dummies() (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.reshape.get_dummies.html) to transform these columns of categorical data into many columns of boolean values (after applying this function correctly there should be one column for every prop_type category and one column for every neighborhood category).
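To see what get_dummies produces before applying it to the real data, here is a toy sketch with made-up values:
In [ ]:
toy = pd.DataFrame({'prop_typeR': ['Property_type_1', 'Property_type_2', 'Property_type_1']})
# One indicator column per category: dummy_Property_type_1, dummy_Property_type_2.
pd.get_dummies(toy, prefix = 'dummy')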
In [22]:
dummies = pd.get_dummies(listings_new[['neighborhoodR', 'prop_typeR']], prefix = ['dummy', 'dummy'])
In [23]:
# Join with listings_new.
listings_with_dummies = listings_new.join(dummies)
The predictor (y) is booking_rate; the regressors (X) are everything else except prop_id, booking_rate, prop_type, neighborhood, and number_of_bookings.
http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.train_test_split.html
http://pandas.pydata.org/pandas-docs/stable/basics.html#dropping-labels-from-an-axis
In [24]:
# train_test_split moved from sklearn.cross_validation to sklearn.model_selection in scikit-learn 0.18.
from sklearn.model_selection import train_test_split
In [25]:
predictor = np.array(listings_with_dummies['booking_rate'])
In [26]:
listings_reduced = listings_with_dummies.drop(['prop_id', 'booking_rate', 'prop_type', 'neighborhood',
                                               'number_of_bookings', 'prop_typeR', 'neighborhoodR'],
                                              axis = 1)
In [27]:
regressors_train, regressors_test, predictor_train, predictor_test = train_test_split(
    listings_reduced, predictor, test_size = 0.2, random_state = 12)
In [28]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
In [29]:
lr.fit(regressors_train, predictor_train)
Out[29]:
In [30]:
# Print R squared: the fraction of variance in the test targets explained by the model.
print('Variance score of test data: ', lr.score(regressors_test, predictor_test))
In [31]:
predicted = lr.predict(regressors_test)
expected = predictor_test
In [32]:
fig, ax = plt.subplots(1, 1)
ax.scatter(expected, predicted)
ax.plot(predictor, predictor, 'r')
ax.axis('tight')
ax.set_xlabel('Observed booking rate')
ax.set_ylabel('Predicted booking rate')
print "Residual mean squared error of the regression:", np.sqrt(np.mean((predicted - expected) ** 2))
This means that 23% of the variance in the test data can be explained by the model trained on the training data. R squared is the explained variation divided by the total variation; higher scores mean that more variation is explained by the model. This suggests that the fitted model is not that bad (in studies involving human behavior, most report explained variance below 50%), but it would probably be better to select features to feed into the model rather than entering everything. With only 144 individual data points, the number of regressors (here: 24) is also quite large.
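To check how sensitive that score is to the particular train/test split, and to shrink uninformative coefficients, cross-validation with an L1-penalized model is one option. A minimal sketch, assuming the listings_reduced and predictor objects defined above:
In [ ]:
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.model_selection import cross_val_score

# R squared of the plain linear model across 5 different splits.
print('R squared per fold:', cross_val_score(LinearRegression(), listings_reduced, predictor, cv = 5, scoring = 'r2'))

# LassoCV tunes its penalty by internal cross-validation and pushes
# coefficients of uninformative regressors to exactly zero.
lasso = LassoCV(cv = 5).fit(listings_reduced, predictor)
print('Non-zero coefficients:', np.sum(lasso.coef_ != 0), 'of', len(lasso.coef_))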
In [33]:
# Copy orig data.
listings_copy = listings_new.copy()
In [34]:
# Compute new variable of interest.
listings_copy['monthly_revenue'] = (listings_copy['number_of_bookings'] * listings_copy['price']) / listings_copy['tenure_months']
In [35]:
# Get dummy variables. Join with data copy.
dummies2 = pd.get_dummies(listings_copy[['neighborhoodR', 'prop_typeR']], prefix = ['dummy', 'dummy'])
listings_copy_dummies = listings_copy.join(dummies2)
In [36]:
# Predictor np array. Drop unwanted variables.
predictor2 = np.array(listings_copy_dummies['monthly_revenue'])
listings_copy_reduced = listings_copy_dummies.drop(['prop_id', 'booking_rate', 'prop_type', 'neighborhood', 'number_of_bookings', 'prop_typeR', 'neighborhoodR', 'monthly_revenue'], axis = 1)
In [37]:
# Split.
regr_train, regr_test, pred_train, pred_test = train_test_split(listings_copy_reduced, predictor2, test_size = 0.2, random_state = 9)
In [38]:
lr2 = LinearRegression()
In [39]:
lr2.fit(regr_train, pred_train)
Out[39]:
In [40]:
print('R squared: ', lr2.score(regr_test, pred_test))
print('Booooh, a horrible model (a negative score means the fitted model is worse than a horizontal line at the mean of y).')
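To make the horizontal-line comparison concrete, scikit-learn's DummyRegressor predicts the training mean for every row; its test-set R squared is the zero point that the negative score above falls below. A small sketch, assuming the split from cell [37]:
In [ ]:
from sklearn.dummy import DummyRegressor

# Predicts mean(pred_train) everywhere; test R squared is about zero
# (exactly zero only if the test mean equals the train mean).
baseline = DummyRegressor(strategy = 'mean').fit(regr_train, pred_train)
print('Baseline R squared:', baseline.score(regr_test, pred_test))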
In [41]:
predicted2 = lr2.predict(regr_test)
expected2 = pred_test
In [42]:
fig, ax = plt.subplots(1, 1)
ax.scatter(expected2, predicted2)
ax.plot(predictor2, predictor2, 'r')
ax.axis('tight')
ax.set_xlabel('Observed monthly revenue (USD)')
ax.set_ylabel('Predicted monthly revenue (USD)')
print('Root mean squared error of the regression:', np.sqrt(np.mean((predicted2 - expected2) ** 2)))
In [43]:
# I tried some transformations. Log transform did not normalize predictor2.
# I went for sqrt transform - even though it still didn't look Gaussian.
predictor2_sqrt = np.sqrt(predictor2)
# Split again.
regr2_train, regr2_test, pred2_train, pred2_test = train_test_split(listings_copy_reduced, predictor2_sqrt, test_size = 0.2, random_state = 9)
# Fit.
lr3 = LinearRegression()
lr3.fit(regr2_train, pred2_train)
# Look at score.
print('R squared: ', lr3.score(regr2_test, pred2_test))
print('Ha, made it worse. Thoughts?')
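One possible reason the plain log transform failed: if any listings in the sample have zero bookings, their monthly_revenue is zero and log(0) is undefined. np.log1p handles zeros, and scikit-learn's TransformedTargetRegressor applies the transform and its inverse automatically, so the score is still computed on the original dollar scale. A sketch under those assumptions, reusing the split from cell [37]:
In [ ]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression

# Fit on log1p(y); predictions are mapped back with expm1 before scoring.
ttr = TransformedTargetRegressor(regressor = LinearRegression(),
                                 func = np.log1p, inverse_func = np.expm1)
ttr.fit(regr_train, pred_train)
print('R squared (log1p target, original scale):', ttr.score(regr_test, pred_test))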