In this assignment your challenge is to do some basic analysis for Airbnb. Provided in hw/data/ there are 2 data files, bookings.csv and listings.csv. The objective is to practice data munging and begin our exploration of regression.
In [2]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns # for pretty layout of plots
import matplotlib.pyplot as plt
# This enables inline Plots
%matplotlib inline
In [3]:
pd.__version__
Out[3]:
In [4]:
# Let's explore the Datasets
bookings = pd.read_csv('../data/bookings.csv', parse_dates=['booking_date'])
listings = pd.read_csv('../data/listings.csv')
In [5]:
bookings.tail()
Out[5]:
In [6]:
# Make the booking date the index so the time-based resampling below works.
# NOTE(review): inplace=True mutates `bookings` — re-running this cell on an
# already-indexed frame raises KeyError.
bookings.set_index('booking_date' ,inplace=True)
In [7]:
bookings['number_of_bookings'] = 1
In [8]:
help(bookings.resample)
In [9]:
# Monthly booking counts over time. The resample(..., how='count') keyword
# was deprecated and then removed from pandas; the aggregation is now a
# method called on the Resampler object.
bookings.resample('M').count().number_of_bookings.plot()
Out[9]:
In [10]:
# Without an aggregation (.count(), .sum(), ...) this only builds and
# displays the Resampler object — no data is actually computed.
bookings.resample('D')
Out[10]:
In [11]:
listings.tail()
Out[11]:
In [12]:
listings.describe()
Out[12]:
In [13]:
len([1, 2, 3])
Out[13]:
In [14]:
[1, 2, 3].__len__()
Out[14]:
In [15]:
class MyClass(object):
    """Toy class demonstrating the ``__len__`` protocol.

    The built-in ``len(obj)`` simply delegates to ``obj.__len__()``,
    so ``len(MyClass(n))`` reports whatever value was stored.
    """

    def __init__(self, value):
        # The stored value doubles as the object's reported "length".
        self.value = value

    def __len__(self):
        return self.value
In [16]:
myobj = MyClass(5)
In [17]:
listings.groupby(['prop_type']).agg(['mean', 'count'])
Out[17]:
In [18]:
listings.head(2)
Out[18]:
In [19]:
pd.pivot_table(listings, values='person_capacity', index='neighborhood', columns='prop_type').head(2)
Out[19]:
In [20]:
group_cols = ['neighborhood', 'prop_type']
agg_cols = ['person_capacity', 'price']
listings.groupby(group_cols)[agg_cols].agg(['sum', 'count']).unstack(level='prop_type')
#listings.groupby(group_cols)[agg_cols].agg(['sum', 'count']).unstack(1)
Out[20]:
In [21]:
listings.groupby(['prop_type', 'neighborhood']).agg(['mean', 'count'])
Out[21]:
In [34]:
bookings = bookings.reset_index()
bookings.groupby('booking_date').count()
Out[34]:
In [35]:
# Plot daily bookings
#grid_plot = sns.FacetGrid(bookings, row='booking_date', col='prop_id')
#grid_plot.map(sns.regplot, 'booking_date', color='.3', fit_reg=False, x_jitter=.1)
#prop_id booking_date
#ax = sns.boxplot(bookings.age)
#ax.set_title('Age Distribution by class')
#bookings.groupby(['booking_date']).agg(['count']).plot(kind='bar')
#bookings.groupby('booking_date').agg(['count']).plot(kind='bar')
bookings.groupby('booking_date').count().plot(kind='bar')
Out[35]:
In [36]:
bookings.head()
Out[36]:
In [40]:
#first merge
# Join each booking row to its listing attributes on prop_id (default inner
# join: bookings whose prop_id is absent from listings are dropped).
merged_bookings_listings = pd.merge(bookings, listings, on='prop_id')
# NOTE(review): this head() is not the cell's last expression, so its result
# is discarded and never displayed — dead statement.
merged_bookings_listings.head()
# Bar chart of booking volume per neighborhood.
merged_bookings_listings.groupby('neighborhood')['neighborhood'].count().plot(kind='bar')
#groupby('cylinders')['mpg'].count().plot(kind='bar')
#bookings.resample('M', how='count').number_of_bookings.plot()
Out[40]:
In [ ]:
In [41]:
b =bookings.groupby('prop_id').count()
In [42]:
b.head()
Out[42]:
In [43]:
c = b.reset_index()
In [44]:
c.head()
Out[44]:
In [54]:
# Count bookings per property, then left-join the counts onto listings so
# every listing keeps a row even with zero bookings.
number_of_bookings = bookings.groupby('prop_id').count().reset_index()
number_of_bookings.rename(columns={'booking_date':'number_of_bookings2'}, inplace = True)
listings2 = pd.merge(listings, number_of_bookings, how='left', on='prop_id')
# Listings with no bookings come out of the left merge as NaN; count them as
# 0. Assign back to the column instead of calling fillna(inplace=True) on
# the attribute-accessed Series — that is chained assignment, which pandas
# warns about and which may silently fail to update listings2.
listings2['number_of_bookings'] = listings2['number_of_bookings'].fillna(0)
listings2.head()
Out[54]:
In [55]:
# Bookings per property, joined onto listings to derive a booking rate.
number_of_bookings = bookings.groupby('prop_id').count().reset_index()
number_of_bookings.rename(columns={'booking_date':'booking_date_old'}, inplace = True)
listings2 = pd.merge(listings, number_of_bookings, how='left', on='prop_id')
# Properties with no bookings are NaN after the left merge; treat them as 0.
# Assign back to the column rather than fillna(inplace=True) on the Series
# view — that is chained assignment and may not modify listings2 at all.
listings2['number_of_bookings'] = listings2['number_of_bookings'].fillna(0)
# Normalize by how long the listing has existed to get bookings per month.
listings2['booking_rate'] = listings2.number_of_bookings/listings2.tenure_months
listings2.tail()
Out[55]:
In [57]:
listings2[listings2.tenure_months > 9].tail()
Out[57]:
prop_type and neighborhood are categorical variables. Use get_dummies() (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.reshape.get_dummies.html) to transform these columns of categorical data into many columns of boolean values. (After applying this function correctly there should be 1 column for every prop_type category and 1 column for every neighborhood category.)
In [70]:
# get_dummies expands each string (object) column into multiple columns, one
# per unique value in the original column. Note this only applies to the
# string columns; the numeric columns pass through unchanged.
listings3 = pd.get_dummies(listings2)
#pd.get_dummies(listings2[['prop_type', 'neighborhood', 'price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months', 'booking_rate']])
In [72]:
listings3.columns
Out[72]:
In [115]:
listings3.head()
Out[115]:
The target (y) is booking_rate; the regressors (X) are everything else, except prop_id, booking_rate, prop_type, neighborhood and number_of_bookings.
http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.train_test_split.html
http://pandas.pydata.org/pandas-docs/stable/basics.html#dropping-labels-from-an-axis
In [74]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split

# 80/20 train/test split; random_state pins the shuffle so the split is
# reproducible across runs.
feature_cols = ['price', 'person_capacity', 'picture_count', 'description_length', 'tenure_months']
X_train, X_test, y_train, y_test = train_test_split(
    listings3[feature_cols], listings3.booking_rate, random_state=12, test_size=0.2)
In [ ]:
In [93]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
In [94]:
lr.fit(X_train, y_train)
Out[94]:
In [95]:
lr.score(X_test, y_test)
Out[95]:
The score method returns the coefficient of determination R^2 of the prediction. Our score was 0.1256213709762638, which is quite low (the best possible is 1). This suggests the features in X_test only weakly predict y_test.
In [120]:
# Second attempt with a reduced feature set to see if R^2 improves.
X_train, X_test, y_train, y_test = train_test_split(listings3[['price', 'picture_count', 'tenure_months']
], listings3.booking_rate, random_state=12, test_size=0.2)
lr = LinearRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)
# I tried adding and removing additional fields, and nothing seemed to significantly increase the coefficient.
# Not sure how to create monthly revenue.
Out[120]:
In [ ]:
In [121]:
#Is it possible to plot the final regression in the homework?
#What does “random_state=12” mean??
#How create “monthly revenue” predictor in HW1?
#How could I identify which of the x_test inputs are the most important to determine y?
In [ ]:
In [107]:
##Optional - can we plot this info?##
In [106]:
from sklearn.preprocessing import PolynomialFeatures
def f(x):
    """Ground-truth target curve: sin(2*pi*x)."""
    return np.sin(2 * np.pi * x)

# Dense grid on [0, 1] used to draw smooth curves.
x_plot = np.linspace(0, 1, 100)

def plot_approximation(est, ax, label=None):
    """Plot the approximation of ``est`` on axis ``ax``. """
    # Draw the true curve, the training points, and the model's prediction.
    # NOTE(review): relies on the notebook-level globals x_plot, X_train and
    # y_train; scatter() presumably expects X_train to be a single 1-D
    # feature — confirm before reusing with multi-column X.
    ax.plot(x_plot, f(x_plot), label='ground truth', color='green')
    ax.scatter(X_train, y_train)
    ax.plot(x_plot, est.predict(x_plot[:, np.newaxis]), color='red', label=label)
    ax.set_xlim((0, 1))
    ax.set_ylim((-2, 2))
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.legend(loc='upper right', frameon=True)
In [101]:
# make_pipeline is never imported anywhere in this notebook, so this cell
# raised NameError as written; bring it in from sklearn.pipeline.
from sklearn.pipeline import make_pipeline

fig, ax = plt.subplots(1, 1)
# Degree-1 polynomial features reduce to plain linear regression; raise
# `degree` to fit higher-order approximations of the sin curve.
degree = 1
lr = make_pipeline(PolynomialFeatures(degree), LinearRegression())
plot_approximation(lr, ax, label='1')