In this assignment your challenge is to do some basic analysis for Airbnb. In hw/data/ there are two data files, bookings.csv and listings.csv. The objective is to practice data munging and to begin our exploration of regression.
In [1]:
!ls ../data
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint # for pretty printing
# This enables inline Plots
%matplotlib inline
# Limit rows displayed in notebook
pd.set_option('display.max_rows', 10)
pd.set_option('display.precision', 2)
In [2]:
# Listing attributes: one row per property (prop_id plus neighborhood,
# prop_type, tenure_months, ... — the columns used in the merges below).
listings = pd.read_csv('../data/listings.csv')
listings
Out[2]:
In [3]:
# One row per booking; parse booking_date to datetime64 up front so the
# date formatting/arithmetic in later cells works.
bookings = pd.read_csv('../data/bookings.csv', parse_dates=['booking_date'])
bookings
Out[3]:
In [4]:
# Summary statistics for the numeric listing columns.
listings.describe()
Out[4]:
In [5]:
pd.set_option('display.max_rows', 30)
# Same summary, broken out by property type.
listings.groupby('prop_type').describe()
Out[5]:
In [6]:
pd.set_option('display.max_rows', 40)
# Mean of each numeric column for every (neighborhood, property type) pair.
listings.groupby(['neighborhood','prop_type']).agg('mean')
Out[6]:
In [7]:
# Copy explicitly: plain assignment (`dailybookings = bookings`) would only
# alias the frame, so the new columns would silently appear on `bookings` too.
dailybookings = bookings.copy()
# Vectorized .dt.strftime replaces the per-row lambda map: same result,
# clearer intent, faster.
dailybookings['day'] = dailybookings.booking_date.dt.strftime("%Y-%m-%d")
dailybookings['month'] = dailybookings.booking_date.dt.strftime("%Y-%m")
dailybookings.head(2)
Out[7]:
In [8]:
# Bookings per day: group on the day string and count non-null values,
# then plot the prop_id count as a daily series.
dailytotals = dailybookings.groupby('day').count()
dailytotals['prop_id'].plot()
Out[8]:
In [9]:
# More simply: value_counts() gives bookings per date — but it is ordered by
# frequency, not by date, so sort by the index before drawing the time series
# (otherwise the line plot zig-zags across dates).
bookings.booking_date.value_counts().sort_index().plot()
Out[9]:
In [10]:
# Attach listing attributes (neighborhood, prop_type, ...) to every booking row.
dailylistings = pd.merge(dailybookings, listings, on='prop_id')
dailylistings.head(2)
# Bookings per (day, neighborhood); count() fills every column with the
# same non-null count.
listingtotals =dailylistings.groupby(['day','neighborhood']).count()
listingtotals.head(2)
Out[10]:
In [11]:
# Bookings per day broken out by neighborhood: one row per booking_date,
# one column per neighborhood, each cell the number of bookings.
bookingPivot = dailylistings.pivot_table(values='prop_id',
                                         index='booking_date',
                                         columns='neighborhood',
                                         aggfunc='count')
bookingPivot.head(2)
Out[11]:
In [12]:
# DataFrame.plot() creates its own figure, so the previous bare plt.figure()
# call left an empty stray figure behind; plot directly and place the legend
# outside the axes.
ax = bookingPivot.plot(figsize=(20, 10))
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
Out[12]:
In [13]:
# One row per (month, property): how many bookings that property got that month.
monthlyBookings = pd.DataFrame(dailybookings, columns=['month', 'prop_id'])
monthlyBookings['number_of_bookings'] = 1
MonthTotals = monthlyBookings.groupby(['month', 'prop_id'], as_index=False).agg('sum')
# Attach listing attributes and normalize bookings by listing tenure;
# multiply by 1.0 so Python 2 integer division cannot truncate the rate.
monthlyMerge = pd.merge(MonthTotals, listings, on='prop_id')
monthlyMerge['booking_rate'] = monthlyMerge.number_of_bookings * 1.0 / monthlyMerge.tenure_months
monthlyMerge.describe()
Out[13]:
In [ ]:
In [14]:
# Keep only listings live for most of the observed window so booking_rate
# is comparable across properties.
monthlyFilter = monthlyMerge.loc[monthlyMerge.tenure_months > 9]
monthlyFilter.describe()
Out[14]:
prop_type and neighborhood are categorical variables. Use get_dummies() (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.reshape.get_dummies.html) to transform these columns of categorical data into many columns of boolean values. (After applying this function correctly there should be one column for every prop_type category and one column for every neighborhood category.)
In [15]:
# One-hot encode the categorical columns. Passing `columns=` does the whole
# concat/drop dance in one call and prefixes each dummy column with its source
# column name, so a prop_type value can never collide with a neighborhood
# value that happens to share the same name (the old unprefixed loop could).
monthlyFilter = pd.get_dummies(monthlyFilter, columns=['prop_type', 'neighborhood'])
monthlyFilter.info()
In [17]:
# Save the regression target (y) before it is dropped from the feature
# matrix in the next cell.
booking_rate = monthlyFilter.booking_rate
booking_rate
Out[17]:
In [18]:
# Columns that must not feed the regression: identifiers, the target itself,
# and the raw count the target was derived from.
killme = ['prop_id', 'booking_rate', 'month', 'number_of_bookings']
monthlyFilter = monthlyFilter.drop(killme, axis=1)
monthlyFilter.info()
The target (y) is booking_rate; the regressors (X) are everything else, except prop_id, booking_rate, prop_type, neighborhood, and number_of_bookings.
http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.train_test_split.html
http://pandas.pydata.org/pandas-docs/stable/basics.html#dropping-labels-from-an-axis
In [19]:
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection. Fall back for
# very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
In [20]:
# 60/40 train/test split. Fix the RNG seed so the split — and the model
# score below — is reproducible across re-runs (it previously changed on
# every execution).
X_train, X_test, y_train, y_test = train_test_split(
    monthlyFilter, booking_rate, test_size=0.4, random_state=42)
#a_train.shape
In [21]:
# Call form of print: the bare Python 2 print statement is a SyntaxError
# under Python 3.
print(y_test.shape, X_test.shape)
In [22]:
from sklearn.linear_model import LinearRegression
#lr = LinearRegression()
#X_train = a_train[:,range(1,23)]
#X_train.shape
In [141]:
#Y_train = a_train[:,0]
#Y_train.shape
Out[141]:
In [23]:
# Ordinary least-squares linear regression (no regularization).
lr = LinearRegression()
In [25]:
# Fit on the training split; coefficients end up in lr.coef_.
lr.fit(X_train, y_train)
Out[25]:
In [146]:
#X_test = a_test[:,range(1,23)]
#X_test.shape
Out[146]:
In [150]:
#Y_test = a_test[:,0]
#Y_test.shape
Out[150]:
In [26]:
# R^2 on the held-out test set: 1.0 is a perfect fit, 0 is no better than
# always predicting the mean.
lr.score(X_test, y_test)
Out[26]:
In [40]:
# Leftover interactive help query; harmless, but would be removed from a
# finished notebook.
?lr.score
In [ ]:
This looks like it is not very well correlated: at an R^2 of about 0.2 the model explains little of the variance; the fit would be much better if the score were closer to 1. We need to find better factors.
In [38]:
# Ran out of time here. The goal: measure how long each listing has been
# open, then look at booking growth over tenure.
import datetime

def addmonths(date, months):
    """Return `date` shifted by `months` calendar months (may be negative).

    If the target month is shorter than `date.day` (e.g. Jan 31 + 1 month),
    clamp to the last day of the target month.

    The original version had three bugs: it discarded the result of
    `date.replace(...)` (dates are immutable) and returned None; its
    `targetmonth % 12` arithmetic produced invalid month 0 for December
    targets; and it used a bare `except:`.
    """
    # Work in absolute months (year*12 + month-1) so negative offsets and
    # year boundaries fall out of a single divmod.
    total = date.year * 12 + (date.month - 1) + months
    year, month0 = divmod(total, 12)
    month = month0 + 1
    try:
        return date.replace(year=year, month=month)
    except ValueError:
        # Day of month doesn't exist in the target month: go to the FIRST
        # of the month AFTER, then step back one day.
        if month == 12:
            first_of_next = date.replace(year=year + 1, month=1, day=1)
        else:
            first_of_next = date.replace(year=year, month=month + 1, day=1)
        return first_of_next - datetime.timedelta(days=1)
# Rebuild the per-property monthly totals and booking rate (as in cell [13]).
monthlyMerge = pd.merge(MonthTotals, listings, on='prop_id')
monthlyMerge['booking_rate'] = (monthlyMerge.number_of_bookings +0.0) / monthlyMerge.tenure_months
monthlyMerge.month.max()  # was `.max` — without parens this returned the bound method, not the latest month
monthlyMerge['month_time'] = pd.to_datetime(monthlyMerge.month)
from datetime import timedelta
# Estimate when each listing went live: step back tenure_months months from
# the end of the observed period (2011-12-01).
monthlyMerge['startTime'] = monthlyMerge.tenure_months.map(lambda dt: addmonths(pd.to_datetime("2011-12-01"), -dt))
monthlyMerge.startTime
Out[38]:
In [36]:
?timedelta
In [30]:
#I have run out of time to make this work...
Out[30]:
In [32]:
# NOTE(review): `foo` is never defined anywhere in this notebook, so this
# cell raises NameError on a fresh Restart-&-Run-All — presumably a frame
# with a "price" column (e.g. `listings`) was intended; confirm and fix.
check = foo["price"]
In [35]:
check
Out[35]:
In [ ]: