In this assignment your challenge is to do some basic analysis for Airbnb. Provided in hw/data/ there are 2 data files, bookings.csv and listings.csv. The objective is to practice data munging and begin our exploration of regression.
In [58]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# This enables inline Plots
%matplotlib inline
In [59]:
# Locations of the two raw CSV files, relative to this notebook
listings_path = '../data/listings.csv'
bookings_path = '../data/bookings.csv'
# Load each file into its own data frame
listings = pd.read_csv(listings_path)
bookings = pd.read_csv(bookings_path)
In [82]:
# Columns to summarise with describe()
summary_cols = ['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']
listings[summary_cols].describe()
Out[82]:
In [61]:
# Print the column names, non-null counts and dtypes of the listings frame
listings.info()
In [62]:
# Mean of each listing attribute per property type.
# The columns are selected with a list (double brackets): selecting several
# columns from a groupby with a bare tuple is deprecated and raises an error
# in pandas >= 2.0.
listings.groupby(['prop_type'])[['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']].agg(['mean'])
Out[62]:
In [63]:
# Mean of each listing attribute per (property type, neighborhood) pair.
# The columns are selected with a list (double brackets): selecting several
# columns from a groupby with a bare tuple is deprecated and raises an error
# in pandas >= 2.0.
listings.groupby(['prop_type', 'neighborhood'])[['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']].agg(['mean'])
Out[63]:
In [64]:
# Count the bookings made on each date.
# The result is a Series indexed by booking_date. No unstack is needed here:
# with a single grouping key, unstack(0) only wraps every date in a
# ('count', date) tuple, which then shows up as the x-axis labels of the plot.
cross_tab = bookings.groupby('booking_date')['prop_id'].count()
# Plot the number of bookings per date
cross_tab.plot()
Out[64]:
In [65]:
# Look-up table with just the property id and its neighborhood
neighborhoods = listings[['prop_id', 'neighborhood']]
# Attach the neighborhood to every booking (left join on prop_id keeps all bookings)
bookings_neighborhoods = pd.merge(bookings, neighborhoods, on='prop_id', how='left')
# Count bookings for each (neighborhood, booking_date) pair ...
counts_by_neighborhood_date = bookings_neighborhoods.groupby(['neighborhood', 'booking_date'])['prop_id'].agg(['count'])
# ... then move the neighborhood level into the columns: one column per neighborhood
cross_tab = counts_by_neighborhood_date.unstack(0)
# Plot one line per neighborhood
cross_tab.plot()
Out[65]:
In [66]:
# Number of bookings per property, as a two-column frame:
# prop_id | number_of_bookings
bookings_by_prop = (
    bookings.groupby('prop_id')['prop_id']
    .count()                        # bookings per prop_id
    .rename('number_of_bookings')   # name the count column
    .reset_index()                  # turn the prop_id index back into a column
)
In [67]:
# Merge the per-property booking counts into the listings data.
# Columns left over from a previous run of this cell are dropped first, so
# re-running it does not create number_of_bookings_x / number_of_bookings_y.
listings = (
    listings
    .drop(['number_of_bookings', 'booking_rate'], axis=1, errors='ignore')
    .merge(bookings_by_prop, on='prop_id', how='left')
)
# Listings without any booking come out of the left merge as NaN: count them as 0.
# Assign the result back instead of calling replace(..., inplace=True) on the
# attribute: that chained in-place call is not guaranteed to modify `listings`
# (it has no effect under pandas Copy-on-Write).
listings['number_of_bookings'] = listings['number_of_bookings'].fillna(0)
# Bookings per month of tenure
listings['booking_rate'] = listings['number_of_bookings'] / listings['tenure_months']
# Describe the resulting dataset
listings.info()
In [68]:
# Keep only the listings with more than 10 months of tenure
has_long_tenure = listings['tenure_months'] > 10
listings_filtered = listings.loc[has_long_tenure]
listings_filtered.info()
prop_type and neighborhood are categorical variables. Use get_dummies() (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.reshape.get_dummies.html) to transform these columns of categorical data into many columns of boolean values (after applying this function correctly there should be 1 column for every prop_type category and 1 column for every neighborhood category).
In [69]:
# One-hot encode the categorical columns. With no `columns` argument,
# get_dummies converts every object/category column of the frame
# (expected here: prop_type and neighborhood — TODO confirm there are no others).
listings_filtered = pd.get_dummies(listings_filtered)
The response (y) is booking_rate; the regressors (X) are everything else, except prop_id, booking_rate, prop_type, neighborhood and number_of_bookings.
http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.train_test_split.html
http://pandas.pydata.org/pandas-docs/stable/basics.html#dropping-labels-from-an-axis
In [70]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import train_test_split
In [71]:
# Build the feature frame: every column except
#   - prop_id            (an identifier, not a feature)
#   - booking_rate       (the target)
#   - number_of_bookings (the numerator of booking_rate, so it would leak the target)
#   - monthly_revenue    (price * booking_rate; only present if the later
#                         revenue cell has already been run in this kernel)
excluded = ['prop_id', 'booking_rate', 'number_of_bookings', 'monthly_revenue']
cols = [col for col in listings_filtered.columns if col not in excluded]
iv = listings_filtered[cols]
# Target variable as a NumPy array
dv = listings_filtered['booking_rate'].values
# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and therefore the reported score, is the same on every run.
iv_train, iv_test, dv_train, dv_test = train_test_split(iv, dv, test_size=0.2, random_state=42)
In [72]:
# Ordinary least squares linear regression from scikit-learn
from sklearn.linear_model import LinearRegression
# Unfitted estimator with default settings; it is fitted in the cells below
lr = LinearRegression()
In [73]:
# Fit the regression on the training split only
lr.fit(iv_train, dv_train)
Out[73]:
In [74]:
# R^2 of the fitted model, evaluated on the same training split it was fitted on
# (iv_test / dv_test are not used here)
lr.score(iv_train, dv_train)
Out[74]:
In [75]:
# The score is R^2 (the coefficient of determination): the proportion of the variance in the outcome variable that is explained by the predictor variables.
# A score of 0.41 tells us that the model explains about 40% of the variance in booking_rate. Note that it is computed on the training data.
...type here...
In [76]:
# Derive the new target: monthly revenue = nightly price * bookings per month
listings_filtered['monthly_revenue'] = listings_filtered.price * listings_filtered.booking_rate
# Build the feature frame: every column except
#   - prop_id            (an identifier, not a feature)
#   - monthly_revenue    (the target of this model)
#   - booking_rate       (a factor of monthly_revenue, so it would leak the target)
#   - number_of_bookings (the numerator of booking_rate, same leakage)
excluded = ['prop_id', 'booking_rate', 'number_of_bookings', 'monthly_revenue']
cols = [col for col in listings_filtered.columns if col not in excluded]
iv = listings_filtered[cols]
# Target variable as a NumPy array. The original cell still used booking_rate
# here while leaving monthly_revenue among the features, which made the new
# column a near-direct encoding of the target.
dv = listings_filtered['monthly_revenue'].values
# Hold out 20% of the rows for testing; fixed seed for a reproducible split
iv_train, iv_test, dv_train, dv_test = train_test_split(iv, dv, test_size=0.2, random_state=42)
# Fit the model on the training split
lr.fit(iv_train, dv_train)
# Report R^2 on the training split
lr.score(iv_train, dv_train)
Out[76]:
In [57]:
In [ ]: