In this assignment your challenge is to do some basic analysis for Airbnb. Provided in hw/data/ are two data files, bookings.csv and listings.csv. The objective is to practice data munging and to begin our exploration of regression.
In [1]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns # for pretty layout of plots
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import itertools
import sklearn as sk
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
# This enables inline Plots
%matplotlib inline
# Limit rows displayed in notebook
pd.set_option('display.max_rows', 20)
pd.set_option('display.precision', 4)
In [2]:
# Load the two raw data files (paths are relative to the notebook directory).
listings = pd.read_csv('../data/listings.csv')
# parse_dates expects the column name(s) to convert; the original bare
# `parse_dates=True` only applies to the index (none was set, so it was a
# no-op) and was followed by a redundant manual to_datetime conversion.
bookings = pd.read_csv('../data/bookings.csv', parse_dates=['booking_date'])
In [3]:
# DataFrame.info() writes its summary to stdout and returns None, so wrapping
# it in `print` just emits an extra "None" line after each summary.
listings.info()
bookings.info()
In [4]:
# Summary statistics (count/mean/std/quartiles) for the numeric listing columns.
listings.describe()
Out[4]:
In [5]:
# Mean price, capacity, picture count, description length and tenure per
# property type. Select the columns with a list-of-lists ([[...]]) — passing
# a bare tuple of column names to a GroupBy was deprecated and later removed
# in pandas.
listings.groupby(['prop_type'])[['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']].agg(['mean'])
Out[5]:
In [6]:
# Same per-type means, broken out by neighborhood and ranked by mean price.
# DataFrame.sort() was removed in pandas 0.20 — sort_values() is the
# replacement — and GroupBy column selection needs the [[...]] form.
listings.groupby(['prop_type', 'neighborhood'])[['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']].mean().sort_values(['price'], ascending=False)
Out[6]:
In [7]:
# Daily booking counts over time. value_counts() returns counts ordered by
# frequency, not by date, so sort by the date index before plotting —
# otherwise the line plot jumps back and forth across the x axis.
bookings.booking_date.value_counts().sort_index().plot(figsize=(20, 10))
Out[7]:
In [8]:
# Join bookings to listings so each booking carries its neighborhood, then
# count bookings per (neighborhood, booking_date); unstack(0) pivots the
# neighborhoods into columns so each can be drawn as its own series.
bookings_neighborhood = pd.merge(listings, bookings, on='prop_id')
bookings_neighborhood = bookings_neighborhood.groupby(['neighborhood', 'booking_date'])['neighborhood'].agg(['count']).unstack(0)

# Cycle colors and filled markers so every neighborhood gets a distinct look.
colors = itertools.cycle(['b', 'g', 'r', 'c', 'm', 'y'])
markers = itertools.cycle(mlines.Line2D.filled_markers)

fig, ax = plt.subplots(1, 1)
fig.text(.5, .95, "Daily Bookings by Neighborhood", fontsize=20, ha='center')
fig.set_figwidth(30)
fig.set_figheight(15)
for neighborhood in bookings_neighborhood:
    # Keep only the dates on which this neighborhood had at least one booking.
    y = bookings_neighborhood[bookings_neighborhood[neighborhood].notnull()][[neighborhood]]
    x = y.index
    # Use the next() builtin rather than the Python-2-only .next() method so
    # this cell also runs under Python 3.
    marker = next(markers)
    color = next(colors)
    ax.scatter(x, y, marker=marker, s=50, c=color, label=neighborhood)
ax.legend(loc='upper right');
ax.set_ylabel('Bookings');
In [9]:
# Inspect the pivoted counts table (rows: booking dates, columns: neighborhoods).
bookings_neighborhood
Out[9]:
In [10]:
# Count bookings per property: after groupby().count() every column holds the
# same per-property row count, so booking_date doubles as the bookings tally.
df = bookings.groupby('prop_id').count().reset_index()
# Inner join keeps only properties that appear in both tables.
df = pd.merge(listings, df, on='prop_id')
df.rename(columns={'booking_date': 'number_of_bookings'}, inplace=True)
# Average number of bookings per month of tenure.
df['booking_rate']= df.number_of_bookings/df.tenure_months
In [11]:
# Keep only established listings (more than 9 months of tenure) so the
# booking rate is computed over a meaningful history.
# NOTE(review): the filter is idempotent on re-run, but df is overwritten,
# so the unfiltered frame is no longer recoverable afterwards.
df = df[df['tenure_months']>9]
prop_type and neighborhood are categorical variables; use get_dummies() (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.reshape.get_dummies.html) to transform these columns of categorical data into many columns of boolean values (after applying this function correctly there should be 1 column for every prop_type category and 1 column for every neighborhood category).
In [12]:
# One-hot encode the categorical columns. Use the public pd.get_dummies (the
# pd.core.reshape path is internal and moved in later pandas versions) and
# pass the columns via `columns=`: the original second positional argument
# is actually `prefix`, which only produced the right result by coincidence
# because prop_type and neighborhood were the only object columns.
df = pd.get_dummies(df, columns=['prop_type', 'neighborhood'])
The target (y) is booking_rate; the regressors (X) are everything else, except prop_id, booking_rate, prop_type, neighborhood, and number_of_bookings.
http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.train_test_split.html
http://pandas.pydata.org/pandas-docs/stable/basics.html#dropping-labels-from-an-axis
In [13]:
# Target is booking_rate; features are everything else except the identifier
# and columns that leak the target (number_of_bookings and tenure_months
# define booking_rate directly).
y = df.booking_rate.values
X = df.drop(['prop_id','booking_rate','number_of_bookings', 'tenure_months'],axis=1).values
# Hold out 20% for testing; a fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)
In [14]:
# Pipeline: expand the features to polynomial terms of `degree`, then fit an
# ordinary least-squares model (normalize=True rescales regressors before
# fitting). The stray `lr = LinearRegression()` was never used anywhere in
# the notebook and has been removed.
degree = 1
est = make_pipeline(PolynomialFeatures(degree), LinearRegression(normalize=True))
In [15]:
# Fit the pipeline on the training split (est.fit returns est itself).
regr = est.fit(X_train,y_train)
In [16]:
# Sanity-check the size of the held-out target vector.
y_test.shape
Out[16]:
In [17]:
# Coefficient of determination (R^2) of the model on the held-out data.
regr.score(X_test, y_test)
Out[17]:
In [18]:
# Predicted vs. actual booking rate for both splits on one scatter plot.
test_predict = regr.predict(X_test)
train_predict = regr.predict(X_train)
fig, ax = plt.subplots(1, 1)
# Label the two series — the original plot had two unlabeled colors with no
# way for a reader to tell train from test.
ax.scatter(train_predict, y_train, c='r', label='train')
ax.scatter(test_predict, y_test, label='test')
ax.legend(loc='best')
ax.set_xlabel('Predicted Booking Rate');
ax.set_ylabel('Actual Booking Rate');
Returns the coefficient of determination $R^2$ of the prediction.
The coefficient $R^2$ is defined as $1 - u/v$, where $u$ is the residual sum of squares, `((y_true - y_pred) ** 2).sum()`, and $v$ is the total sum of squares, `((y_true - y_true.mean()) ** 2).sum()`. The best possible score is 1.0; lower values are worse.
$R^2$ shows how well the model fits the data.
In [19]:
# Average revenue per month of tenure: total booking revenue spread over tenure.
df['monthly_revenue'] = df.number_of_bookings*df.price/df.tenure_months
In [20]:
# Target is monthly_revenue; additionally drop price, which enters the target
# definition directly and would leak it into the features.
y = df.monthly_revenue.values
X = df.drop(['prop_id','booking_rate','number_of_bookings', 'tenure_months','monthly_revenue','price'],axis=1).values
# Pin random_state (matching the booking-rate split above) — the original
# call had no seed, so the reported score changed on every re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=9)
In [21]:
# Refit the same pipeline on the revenue target and report R^2 on the test split.
regr = est.fit(X_train, y_train)
regr.score(X_test, y_test)
Out[21]:
In [22]:
# Held-out predictions (used by the baseline comparison in the next cell).
test_predict = regr.predict(X_test)
In [23]:
# Sum of squared errors of a naive baseline that always predicts the mean:
# sum((y_true - mean(y_true))^2), i.e. the total sum of squares. The original
# compared the model's predictions to the mean of the actuals, which measures
# the spread of the predictions rather than the baseline's error.
sum_sq_naive = np.sum((y_test - y_test.mean())**2)
sum_sq_naive
Out[23]:
In [24]:
# Predicted vs. actual monthly revenue for both splits.
test_predict = regr.predict(X_test)
train_predict = regr.predict(X_train)
fig, ax = plt.subplots(1, 1)
# Label the series so the red/blue split is readable without the source.
ax.scatter(train_predict, y_train, c='r', label='train')
ax.scatter(test_predict, y_test, label='test')
ax.legend(loc='best')
ax.set_xlabel('Predicted Monthly Revenue');
ax.set_ylabel('Actual Monthly Revenue');
In [ ]: