In this assignment your challenge is to do some basic analysis for Airbnb. Provided in hw/data/ there are 2 data files, bookings.csv and listings.csv. The objective is to practice data munging and begin our exploration of regression.
In [72]:
import pandas as pd
import numpy as np
import seaborn as sns # for pretty layout of plots
import matplotlib.pyplot as plt
In [73]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.precision', 2)
%matplotlib inline
In [310]:
bookingsdate = pd.read_csv('../data/bookings.csv',index_col='booking_date')
bookings = pd.read_csv('../data/bookings.csv')
listings = pd.read_csv('../data/listings.csv')
In [447]:
neighbor11['booking_date']=pd.to_datetime(bookings['booking_date'])
alldates['booking_date']=pd.to_datetime(alldates['booking_date'])
allbooking=pd.merge(neighbor11,alldates,on='booking_date')
neighbor11
Out[447]:
In [378]:
dates = pd.date_range('1/1/2011', periods=365)
alldates=pd.DataFrame(index=dates)
#alldatesbook=pd.concat([alldates,bookingsdate])
#alldatesbook
In [ ]:
In [76]:
listings
Out[76]:
In [76]:
In [274]:
listings.describe()
Out[274]:
In [275]:
# or alternatively...
listingstats=listings.groupby('prop_id').agg(['count', 'mean', 'median', 'min', 'max'])
listingstats[['price','person_capacity','picture_count','description_length','tenure_months']]
Out[275]:
In [276]:
typestats=listings.groupby('prop_type').agg(['count', 'mean', 'median', 'min', 'max'])
typestats[['price','person_capacity','picture_count','description_length','tenure_months']]
Out[276]:
In [16]:
neighborhoodstats=listings.groupby(['prop_type','neighborhood']).agg(['count', 'mean', 'median', 'min', 'max'])
neighborhoodstats[['price','person_capacity','picture_count','description_length','tenure_months']]
Out[16]:
In [551]:
#plotting timeseries of all bookings
dailystats=bookings.groupby(['booking_date']).agg('count')
dailystats.plot(figsize=(20,10))
Out[551]:
In [277]:
joineddf=bookings.merge(listings, on="prop_id")
joineddf
Out[277]:
In [928]:
neighborhoods=joineddf['neighborhood'].unique()
len(neighborhoods)
Out[928]:
In [491]:
bookingsneighbor['count']=joineddf.groupby(['neighborhood','booking_date'])['neighborhood'].agg('count')
neighbor11=pd.DataFrame(bookingsneighbor['Neighborhood 11'])
alldates['booking_date']=dates
neighbor11['booking_date'] = neighbor11.index
neighbor11['booking_date'] =neighbor11['booking_date'].astype(str)
type(neighbor11['booking_date'][2])
#pd.merge(neighbor11,alldates,on='booking_date')
bookingsneighborpd=pd.DataFrame(bookingsneighbor)
Out[491]:
In [946]:
color=['red','blue','green','yellow','orange','red','blue','green','yellow','orange','red','blue','green','yellow','orange','red','blue','green','yellow','orange']
In [948]:
#looking at other students plots I see I may have interpreted daily plot differently and you may have expected a bar plot with days of the week
#I'm having trouble generating a meaningful legend and a random color for each line but I'm pretty happy I figured out how to
# plot a sparse data series because initally I was plotting only the days wheres there was a booking instead of the entire year.
#however I see how this plot would not be particularly helpful.
import random
neighborhooddfs = []
for hood in neighborhoods:
neighborhooddfs.append(pd.DataFrame(bookingsneighbor[hood]))
alldatesneighborhooddfs=[]
for hooddf in neighborhooddfs:
hooddf['booking_date'] = hooddf.index
hooddf['booking_date'] = hooddf['booking_date'].astype(str)
alldatesneighborhooddfs.append(pd.merge(alldates,hooddf,on='booking_date',how='left'))
alldatesneighborhooddfs[1]=alldatesneighborhooddfs[1].fillna(0)
ax=alldatesneighborhooddfs[1].plot(by='booking_date', figsize = (20,10), legend=True)
for i in range(len(alldatesneighborhooddfs)):
alldatesneighborhooddfs[i]=alldatesneighborhooddfs[i].fillna(0)
for i in range(len(alldatesneighborhooddfs[1:])):
alldatesneighborhooddfs[i].fillna(0)
alldatesneighborhooddfs[i].plot(by='booking_date', figsize = (20,10), ax=ax)
#for i in range(len(neighborhoods)):
# pd.DataFrame(bookingsneighbor[i])
# list=["thing", "other thing", "third thing"]
# for thing in list:
# print thing
# for i in len(list):
# print list[i]
In [ ]:
seaborn.barplot
In [570]:
#didnt use this
neighbor14=pd.DataFrame(bookingsneighbor['Neighborhood 14'])
neighbor16=pd.DataFrame(bookingsneighbor['Neighborhood 16'])
neighbor13=pd.DataFrame(bookingsneighbor['Neighborhood 13'])
neighbor15=pd.DataFrame(bookingsneighbor['Neighborhood 15'])
neighbor18=pd.DataFrame(bookingsneighbor['Neighborhood 18'])
neighbor17=pd.DataFrame(bookingsneighbor['Neighborhood 17'])
neighbor12=pd.DataFrame(bookingsneighbor['Neighborhood 12'])
neighbor4=pd.DataFrame(bookingsneighbor['Neighborhood 4'])
neighbor19=pd.DataFrame(bookingsneighbor['Neighborhood 19'])
neighbor5=pd.DataFrame(bookingsneighbor['Neighborhood 5'])
neighbor20=pd.DataFrame(bookingsneighbor['Neighborhood 20'])
neighbor21=pd.DataFrame(bookingsneighbor['Neighborhood 21'])
neighbor9=pd.DataFrame(bookingsneighbor['Neighborhood 9'])
neighbor7=pd.DataFrame(bookingsneighbor['Neighborhood 7'])
neighbor8=pd.DataFrame(bookingsneighbor['Neighborhood 8'])
neighbor22=pd.DataFrame(bookingsneighbor['Neighborhood 22'])
neighbor3=pd.DataFrame(bookingsneighbor['Neighborhood 3'])
neighbor1=pd.DataFrame(bookingsneighbor['Neighborhood 1'])
neighbor10=pd.DataFrame(bookingsneighbor['Neighborhood 10'])
neighbor6=pd.DataFrame(bookingsneighbor['Neighborhood 6'])
#for i in range(len(bookingsneighbor['neighborhood'])):
# pd.DataFrame(bookingsneighbor[i])
In [643]:
# alldates['booking_date']= alldates['booking_date'].map(lambda datetime: str(alldates['booking_date'][datetime].split(" ")[0]))
# alldates
# # type(alldates['strtime'][2])
# allbooking=pd.merge(alldates,neighbor11,on='booking_date',how='left')
# allbooking=allbooking.fillna(0)
# allbooking.plot(figsize = (20,10),legend=True)
# # # type(alldates['strtime'][2])
# # datetime=pd.Timestamp('2011-01-06')
# # str(alldates['booking_date'][datetime])
In [644]:
# ax=bookingsneighbor['Neighborhood 1'].plot(figsize = (20,10), legend=True,xticks=None)
In [645]:
# bookingsneighbor=joineddf.groupby(['neighborhood','booking_date'])['neighborhood'].agg('count')
# ax=bookingsneighbor['Neighborhood 11'].plot(by='booking_date', figsize = (20,10), legend=True, ax=ax)
# #bookingsneighbor['Neighborhood 1'].plot(by='booking_date', figsize = (20,10), legend=True, ax=ax)
In [ ]:
In [163]:
##Part 2 - Develop a data set
Out[163]:
In [649]:
#adding columns for number of bookings and rate
numbook=joineddf.groupby(['prop_id']).agg('count')
listings['number_of_bookings']=numbook['booking_date']
listings['booking_rate']=listings['number_of_bookings']/listings['tenure_months']
listings=listings.fillna(0)
listings
Out[649]:
In [661]:
#filtering well established properties
established = listings[listings['tenure_months'] > 10]
established=established.fillna(0)
established
Out[661]:
prop_type and neighborhood are categorical variables, use get_dummies() (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.reshape.get_dummies.html) to transform this column of categorical data to many columns of boolean values (after applying this function correctly there should be 1 column for every prop_type and 1 column for every neighborhood category.
In [662]:
#creating dummy variables for prop type
listings_dum_prop=pd.get_dummies(listings['prop_type'])
listings_dum_prop
Out[662]:
In [663]:
#creating dummy variables for neighborhood
listings_dum_neig=pd.get_dummies(listings['neighborhood'])
listings_dum_neig
Out[663]:
predictor (y) is booking_rate, regressors (X) are everything else, except prop_id,booking_rate,prop_type,neighborhood and number_of_bookings
http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.train_test_split.html
http://pandas.pydata.org/pandas-docs/stable/basics.html#dropping-labels-from-an-axis
In [666]:
from sklearn.cross_validation import train_test_split
In [671]:
listings.columns
Out[671]:
In [ ]:
In [809]:
bookrate_train, bookrate_test, piccount_train, piccount_test, price_train, price_test, desclen_train, desclen_test, tenure_train, tenure_test = train_test_split(listings['booking_rate'],listings['picture_count'], listings['price'], listings['description_length'], listings['tenure_months'],test_size=0.33, random_state=42)
#bookrate_train, bookrate_test, price_train, price_test= train_test_split(listings['booking_rate'],listings['price'],test_size=0.25)
In [868]:
#creating my X training set for fitting
everything_train=[]
for everything in range(len(price_train)):
everything = [piccount_train[everything],price_train[everything], desclen_train[everything], tenure_train[everything]]
everything_train.append(everything)
In [882]:
#creating my X testing set
everything_test=[]
for everything in range(len(price_test)):
everything = [piccount_test[everything],price_test[everything], desclen_test[everything], tenure_test[everything]]
everything_test.append(everything)
In [837]:
# didnt use this
pictraining =[]
for pic in piccount_train:
pictraining.append([pic.astype(float)])
In [838]:
# didnt use this
tenuretraining =[]
for tenure in tenure_train:
tenuretraining.append([tenure.astype(float)])
In [839]:
# didnt use this
descriptraining =[]
for description in desclen_train:
descriptraining.append([description.astype(float)])
In [840]:
# didnt use this
price_train
pricetraining =[]
for price in price_train:
pricetraining.append([price.astype(float)])
In [927]:
#creating my Y training set for fitting
bookrate_train
booktraining =[]
for rate in bookrate_train:
booktraining.append([rate])
In [842]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.cross_validation import train_test_split
import matplotlib.pylab as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from IPython.core.pylabtools import figsize
figsize(5,5)
plt.style.use('fivethirtyeight')
In [835]:
def plot_approximation(est, ax, label=None):
"""Plot the approximation of ``est`` on axis ``ax``. """
#ax.plot(x_plot, f(x_plot), label='ground truth', color='green')
ax.scatter( pictraining, booktraining, label='training data', color='red')
ax.plot(x_plot, est.predict(x_plot[:, np.newaxis]), color='blue', label=label)
ax.set_ylim((0, 100))
ax.set_xlim((0, 65))
ax.set_ylabel('y')
ax.set_xlabel('x')
ax.legend(loc='upper right',frameon=True)
In [834]:
piccount_train.max()
Out[834]:
In [887]:
#below fitting my model with my train sets
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
degree = 1
est = make_pipeline(PolynomialFeatures(degree), LinearRegression())
est.fit(everything_train, booktraining)
Out[887]:
In [863]:
# sett=[pictraining,descriptraining,pricetraining,tenuretraining]
Out[863]:
In [865]:
#making some plots that I did not end up using but I wanted to retain this code
# for lst in sett:
# fig,ax = plt.subplots(1,1)
# degree = 1
# est = make_pipeline(PolynomialFeatures(degree), LinearRegression())
# est.fit(lst, booktraining)
# ax.scatter( lst, booktraining, label='training data', color='red')
# ax.plot(x_plot, est.predict(x_plot[:, np.newaxis]), color='blue',label='fit')
# ax.set_ylim((0, 55))
# ax.set_xlim((0, max(map(max,lst))))
# ax.set_ylabel('y')
# ax.set_xlabel('x')
# ax.legend(loc='upper right',frameon=True)
In [898]:
#creating my prediction from my X test set
prediction=est.predict(everything_test)
#plotting this prediction against my Y test set, if this was a good model I would expect a line
fig,ax = plt.subplots(1,1)
ax.scatter( prediction, bookrate_test, label='prediction', color='red')
Out[898]:
In [889]:
#reporting my score, this model isn't very good
score = est.score(everything_test,bookrate_test)
score
Out[889]:
I think the score method is calculating the r sqaured value and an r sqaured of 0.12 is telling me the model is not particularly good at predicting booking rate from the parameters we fed it.
In [ ]:
In [ ]:
In [ ]:
In [ ]: