In this assignment your challenge is to do some basic analysis for Airbnb. Provided in hw/data/ there are 2 data files, bookings.csv and listings.csv. The objective is to practice data munging and begin our exploration of regression.
In [3]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# This enables inline Plots
%matplotlib inline
# Limit rows displayed in notebook
pd.set_option('display.max_rows', 10)
# Render floats with 2 decimal places in displayed tables
pd.set_option('display.precision', 2)
In [4]:
pd.__version__
Out[4]:
In [5]:
# Load the two raw data files (paths relative to the notebook's directory)
listings = pd.read_csv('../data/listings.csv')
bookings = pd.read_csv('../data/bookings.csv')
In [6]:
listings.head(5)
Out[6]:
In [7]:
listings.info()
In [8]:
bookings.head(5)
Out[8]:
In [9]:
bookings.info()
In [10]:
listings.groupby('prop_type').prop_id.count()
Out[10]:
In [11]:
bookings.groupby('prop_id').count()
Out[11]:
In [12]:
print 'price', listings.price.mean(), '/', listings.price.median(), '/', listings.price.std()
print 'capacity', listings.person_capacity.mean(), '/', listings.person_capacity.median(), '/', listings.person_capacity.std()
print 'picture count', listings.picture_count.mean(), '/', listings.picture_count.median(), '/', listings.picture_count.std()
print 'desc length', listings.description_length.mean(), '/', listings.description_length.median(), '/', listings.description_length.std()
print 'tenure, in months', listings.tenure_months.mean(), '/', listings.tenure_months.median(), '/', listings.tenure_months.std()
In [13]:
print 'price by', listings.groupby('prop_type').price.mean()
print 'capacity by', listings.groupby('prop_type').person_capacity.mean()
print 'pictures by', listings.groupby('prop_type').picture_count.mean()
print 'description length by', listings.groupby('prop_type').description_length.mean()
print 'tenure by', listings.groupby('prop_type').tenure_months.mean()
In [14]:
# Mean price per (neighborhood, prop_type) cell, pivoted so that
# neighborhoods are rows and property types are columns.
avg_price = listings.groupby(['neighborhood', 'prop_type'])['price'].mean()
avg_price.unstack('prop_type')
Out[14]:
In [15]:
# Parse the booking_date strings into pandas Timestamps (bracket assignment
# replaces the column in place).
bookings['booking_date'] = pd.to_datetime(bookings['booking_date'])
# BUG FIX: the original `bookings.book_date = ...` attribute assignment does
# NOT create a DataFrame column — it only sets a plain Python attribute on
# the object. The derived calendar dates are kept as a standalone Series
# here, which preserves the original downstream state (no book_date column).
book_date = bookings['booking_date'].map(lambda ts: ts.date())
book_date[0]
# Bookings per day: row counts per unique booking timestamp
bookings_by_date = bookings.groupby('booking_date').count()
In [16]:
bookings_by_date.head(5)
Out[16]:
In [17]:
bookings_by_date.tail(5)
Out[17]:
In [18]:
bookings_by_date.info()
In [20]:
bookings_by_date.hist()
Out[20]:
In [150]:
# wait maybe we aren't supposed to build a histogram -- we may want a line chart / time series instead?
# BUG FIX: the original cell discarded both the unstack() result and the
# rename() result (rename returns a NEW frame), so neither took effect.
# Assign the renamed frame back so the count column is labelled 'bookings'.
bookings_by_date = bookings_by_date.rename(columns={'prop_id': 'bookings'})
bookings_by_date
Out[150]:
In [148]:
bookings_by_date.plot()
Out[148]:
In [22]:
# merge listings data into bookings (by prop_id)
# Make the join key explicit: a bare merge() silently joins on ALL column
# names the two frames happen to share, which is fragile if either file
# gains a column later.
bookinginfo = bookings.merge(listings, on='prop_id')
bookinginfo.info()
In [25]:
bookinginfo.head(5)
Out[25]:
In [251]:
# Group the merged frame by booking date.
neighbookinfo = bookinginfo.groupby(bookinginfo.booking_date)
# NOTE(review): GroupBy.head(5) returns the first 5 rows of EVERY date
# group, not a 5-row preview of the whole frame — likely not what was
# intended here.
neighbookinfo.head(5)
Out[251]:
In [265]:
neigh_new = bookinginfo[['booking_date','neighborhood']]
In [267]:
neigh_new.head(5)
Out[267]:
In [282]:
# Bookings per day, broken out by neighborhood.
# BUG FIX: the original cell passed Series objects to plot(x=..., y=...,
# by=...); DataFrame.plot expects column *labels*, so that call could never
# produce the intended chart (the author's comment says as much). Pivoting
# day x neighborhood counts into columns and plotting gives one line per
# neighborhood.
daily_by_neigh = (neigh_new.groupby(['booking_date', 'neighborhood'])
                           .size()
                           .unstack('neighborhood')
                           .fillna(0))
daily_by_neigh.plot()
In [50]:
# group bookings dataframe by prop_id and merge into listings
# calculate booking_rate
bookrate = bookings.groupby('prop_id').count()
bookrate.rename(columns={'booking_date':'times_booked'}, inplace=True)
bookrate.info()
In [46]:
bookrate.head(5)
Out[46]:
In [62]:
bookrate.info()
In [63]:
bookrate.reset_index(inplace=True)
In [66]:
listinginfo = listings.merge(bookrate, on='prop_id', how='left')
In [74]:
listinginfo['booking_rate'] = (listinginfo.times_booked / listinginfo.tenure_months)
In [75]:
listinginfo.info()
In [224]:
# Keep only listings with at least 10 months of tenure, so booking_rate is
# based on a meaningful observation window.
# .copy() makes srlistings an independent frame rather than a view of
# listinginfo, so later in-place edits (column drops/adds) cannot trigger
# SettingWithCopyWarning or alias the parent frame's data.
srlistings = listinginfo[listinginfo.tenure_months >= 10].copy()
srlistings.info()
prop_type and neighborhood are categorical variables. Use get_dummies() (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.reshape.get_dummies.html) to transform each of these columns of categorical data into many columns of boolean values. (After applying this function correctly there should be 1 column for every prop_type category and 1 column for every neighborhood category.)
In [225]:
srlistings.groupby('prop_type').prop_id.count()
Out[225]:
In [226]:
# need to construct dummy variables in pandas 0.14.1
# One-hot encode the two categorical columns, then attach the new columns
# alongside the originals. Concatenating on axis=1 aligns on the index,
# which is equivalent to the pair of left_index/right_index merges.
prop_type_dummies = pd.get_dummies(srlistings.prop_type)
neighborhood_dummies = pd.get_dummies(srlistings.neighborhood)
srlistings = pd.concat([srlistings, prop_type_dummies, neighborhood_dummies], axis=1)
In [230]:
srlistings.drop(['prop_id','prop_type','neighborhood'], axis=1, inplace=True)
The target (y) is booking_rate; the regressors (X) are everything else except prop_id, booking_rate, prop_type, neighborhood, and number_of_bookings.
http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.train_test_split.html
http://pandas.pydata.org/pandas-docs/stable/basics.html#dropping-labels-from-an-axis
In [158]:
from sklearn.cross_validation import train_test_split
In [233]:
# Regressor matrix (X): everything except the target and its numerator.
# times_booked would leak the answer into the features, and booking_rate IS
# the answer. (Removed the superseded, commented-out column-list variant.)
modellistings = srlistings.drop(['times_booked', 'booking_rate'], axis=1)
In [234]:
modellistings.info()
In [235]:
X_train, X_test, y_train, y_test = train_test_split(modellistings,srlistings.booking_rate, test_size=.3)
In [236]:
modellistings.sort_index(by='tenure_months')
Out[236]:
In [237]:
y_train = np.nan_to_num(y_train)
In [238]:
# Ordinary least squares regression model
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
In [170]:
#import matplotlib.pylab as plt
#from sklearn.preprocessing import PolynomialFeatures
#from sklearn.pipeline import make_pipeline
#from IPython.core.pylabtools import figsize
#figsize(5,5)
#plt.style.use('fivethirtyeight')
In [239]:
# Fit OLS on the training split and show the fitted coefficients.
lr.fit(X_train,y_train)
# NOTE(review): under Python 2 this print call renders a tuple.
print('Coefficients: \n', lr.coef_)
In [243]:
modellistings.info()
In [240]:
lr.predict(X_test)
Out[240]:
In [241]:
lr.score(X_train, y_train, sample_weight=None)
Out[241]:
The training R^2 estimates the variation in booking rate explained by the regressors at less than 20%.
Other factors than price / size / pics / desc / tenure may be important (e.g. rating, responsiveness). Some of the selected factors (e.g. tenure) may be unimportant and thus add noise to the model. Still need to add the category and neighborhood dummies.
In [ ]:
# Simplify the model: remove neighborhoods, tenure
In [246]:
# Simplified feature set: price / size / pics / desc plus the property-type
# dummies (neighborhoods and tenure removed).
# .loc replaces the deprecated, ambiguous .ix indexer, and a fixed
# random_state makes the split reproducible across runs.
newmodellistings = modellistings.loc[:, ['price', 'person_capacity',
                                         'picture_count', 'description_length',
                                         'Property type 1', 'Property type 2',
                                         'Property type 3']]
X_train, X_test, y_train, y_test = train_test_split(
    newmodellistings, srlistings.booking_rate, test_size=.3, random_state=42)
In [248]:
# Refit on the simplified features; zero out NaN targets as before.
y_train = np.nan_to_num(y_train)
lr.fit(X_train,y_train)
print('Coefficients: \n', lr.coef_)
In [249]:
lr.score(X_train, y_train, sample_weight=None)
Out[249]:
In [ ]:
# Maybe use a lasso or something to constrain the list of terms? With only 144 data points, 24 columns seems excessive