In this assignment your challenge is to do some basic analysis for Airbnb. Provided in hw/data/ there are 2 data files, bookings.csv and listings.csv. The objective is to practice data munging and begin our exploration of regression.
In [229]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output
%matplotlib inline
# Cap printed DataFrames/Series at 10 rows so cell outputs stay short
pd.set_option('display.max_rows', 10)
In [230]:
# Read the two raw Airbnb tables from the shared data directory.
# booking_date is parsed to datetime while loading.
listings = pd.read_csv('../data/listings.csv')
bookings = pd.read_csv(
    '../data/bookings.csv',
    parse_dates=['booking_date'],
)
In [231]:
# Preview the first rows of bookings. A bare expression gets the notebook's
# rich table display and, unlike the Python-2-only `print` statement, also
# runs on Python 3.
bookings.head()
In [232]:
# Preview the first rows of listings. A bare expression gets the notebook's
# rich table display and, unlike the Python-2-only `print` statement, also
# runs on Python 3.
listings.head()
In [233]:
# Summary of bookings: column dtypes and non-null counts
bookings.info()
In [234]:
# Summary of listings: column dtypes and non-null counts
listings.info()
In [235]:
# Column labels available in bookings
bookings.columns
Out[235]:
In [236]:
# Column labels available in listings
listings.columns
Out[236]:
In [237]:
# for col in ['neighborhood']:
# listings[col] = listings[col].map(lambda x: x.replace('Neighborhood ', '')).astype(int)
#for col in ['prop_type']:
# listings[col] = listings[col].map(lambda x: x.replace('Property type ', '')).astype(int)
#print listings
In [238]:
# Re-check listings dtypes. The conversion code in the previous cell is
# commented out, so this shows the frame exactly as it was loaded.
listings.info()
In [239]:
# Standard deviation of the listing price across all listings
listings.price.std()
Out[239]:
In [240]:
# listings.groupby('prop_id').price.mean()
In [241]:
# Descriptive statistics for the four numeric listing attributes
listings[['price', 'person_capacity', 'picture_count', 'description_length']].describe()
Out[241]:
In [242]:
# Two views of listing price side by side: a box plot (left) and the raw
# values in row order (right).
fig, ax = plt.subplots(nrows=1, ncols=2)
box_axis, series_axis = ax
listings.boxplot(column='price', ax=box_axis)
box_axis.set_ylim([0, 500])  # restrict the box plot's y-axis to 0-500
listings['price'].plot(ax=series_axis)
# Show the listings priced above 1000, which lie outside the box plot's y-range.
listings[listings['price'] > 1000]
Out[242]:
In [243]:
# Group listings by property type; the bare expression below only shows the
# repr of the price selection on that grouping (no aggregation happens here).
grouped = listings.groupby('prop_type')
grouped['price']
Out[243]:
In [244]:
# grouped['price'].isnull() == False].describe()
# grouped[grouped['price'].isnull() == False].describe()
In [245]:
# Mean, standard deviation and median of the numeric listing attributes for
# each property type.
# The aggregations are given by name rather than as NumPy callables: pandas
# currently maps np.mean/np.std/np.median onto its own groupby methods
# (sample std, NaN-aware), but newer releases deprecate that mapping. The
# string names pin that behaviour.
grouped = listings.groupby('prop_type')
summary_columns = ['price', 'person_capacity', 'picture_count', 'description_length']
grouped[summary_columns].agg(['mean', 'std', 'median'])
Out[245]:
In [246]:
# Distinct prop_type values within each prop_type group
grouped.prop_type.unique()
Out[246]:
In [247]:
# Group listings by (property type, neighborhood) and keep the four numeric
# attributes for aggregation. The columns are selected with a list: selecting
# several columns with a bare tuple of keys (gb['a', 'b']) is rejected by
# current pandas releases.
naybor = listings.groupby(['prop_type', 'neighborhood'])[
    ['price', 'person_capacity', 'picture_count', 'description_length']
]
In [248]:
# Mean, standard deviation and median per (property type, neighborhood).
# Aggregations are named as strings because passing NumPy callables to
# groupby.agg is deprecated in newer pandas; the names keep the current
# pandas implementations (sample std, NaN-aware).
naybor.agg(['mean', 'std', 'median'])
Out[248]:
In [249]:
# Number of bookings made on each calendar date. Every row of `bookings` is
# one booking, so the size of each booking_date group is that day's count.
# (The previous version plotted prop_id against the date and set a
# non-existent `fig_size` attribute on the returned Axes, which had no effect.)
daily_bookings = bookings.groupby('booking_date').size()
fig, ax = plt.subplots(figsize=(50, 20))
daily_bookings.plot(ax=ax)
fig.suptitle('Daily Bookings')
ax.set_xlabel('Booking Date')
ax.set_ylabel('# of Bookings');
Out[249]:
In [250]:
# Left-join bookings onto listings by prop_id: one row per (listing, booking),
# and listings without any booking are kept with a missing booking_date.
n_listings = pd.merge(listings, bookings, how='left', on='prop_id')
In [251]:
# Summary of the merged listings/bookings frame: dtypes and non-null counts
n_listings.info()
In [252]:
#n_listings.booking_date = pd.to_datetime(n_listings.booking_date)
In [253]:
# Label every merged row with the weekday name of its booking date and store
# it as the 'day_name' column that the plot below uses as hue. (The previous
# expression computed day-of-week numbers but discarded the result, so the
# column was never created.)
# Listings without any booking have a missing booking_date after the left
# merge; those rows get a missing day_name as well.
DAY_NAMES = ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
             'Friday', 'Saturday', 'Sunday']  # Timestamp.dayofweek: Monday == 0
n_listings['day_name'] = n_listings.booking_date.map(
    lambda ts: DAY_NAMES[ts.dayofweek] if pd.notnull(ts) else np.nan)
n_listings['day_name'].value_counts()
In [253]:
In [253]:
In [254]:
# Categorical plot of neighborhood (x) split by day_name (hue) over the merged frame.
# Expects n_listings to carry a 'day_name' column.
sns.factorplot("neighborhood", hue ="day_name", data = n_listings, palette = "Purples_d", size = 25, legend = True);
In [255]:
# Number of bookings per neighborhood.
# The previous version could not run: the chained assignment bound
# `fig, ax = (50, 20)`, so `ax.set_xlabel` was called on an int.
# NOTE(review): `bookings` is only used elsewhere in this notebook with
# prop_id and booking_date, so the neighborhood is taken from the merged
# n_listings frame — confirm that bookings really has no neighborhood column.
# count() skips the missing booking_date of listings that were never booked.
bookings_by_neighborhood = n_listings.groupby('neighborhood').booking_date.count()
fig, ax = plt.subplots(figsize=(50, 20))
bookings_by_neighborhood.plot(kind='bar', ax=ax, label='Bookings')
fig.suptitle('Bookings by Neighborhood')
ax.set_xlabel('Neighborhood')
ax.set_ylabel('# of Bookings')
ax.legend();
In [256]:
# Summary of listings before the booking counts are attached
listings.info()
In [257]:
# One row per property with the number of bookings it received.
# count() tallies the non-null entries of every remaining column per prop_id;
# the booking_date tally is the booking count and is renamed accordingly.
bookings_per_property = bookings.groupby('prop_id').count()
total_book = bookings_per_property.reset_index().rename(
    columns={'booking_date': 'number_of_bookings'})
total_book
Out[257]:
In [258]:
# Attach the per-property booking count to listings (left join: listings with
# no bookings get a missing count).
# Guarded so that re-running this cell does not merge a second time, which
# would yield number_of_bookings_x / number_of_bookings_y and break the
# cells that follow.
if 'number_of_bookings' not in listings.columns:
    listings = listings.merge(total_book, on='prop_id', how='left')
In [259]:
# Listings that never appear in bookings come out of the left merge with a
# missing count; treat them as zero bookings.
# Assigned back explicitly: an in-place fillna on a column obtained through
# attribute access is a chained operation that pandas does not guarantee to
# write through to the parent frame (and does not under Copy-on-Write).
listings['number_of_bookings'] = listings['number_of_bookings'].fillna(0)
In [260]:
# Spot-check the first rows of listings after adding number_of_bookings
listings.head()
Out[260]:
In [261]:
# Booking rate = number of bookings divided by the listing's tenure in months.
# NOTE(review): a tenure_months of 0 would produce inf (or NaN for 0/0) here —
# confirm whether such rows exist before the tenure filter is applied.
listings['booking_rate'] = listings['number_of_bookings']/listings['tenure_months']
listings
Out[261]:
In [268]:
# Keep only listings with a tenure of at least 10 months, then summarise
# what is left.
has_min_tenure = listings['tenure_months'] >= 10
listings = listings[has_min_tenure]
listings.info()
prop_type and neighborhood are categorical variables. Use get_dummies() (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.reshape.get_dummies.html) to transform each of these columns of categorical data into many columns of boolean values (after applying this function correctly there should be 1 column for every prop_type category and 1 column for every neighborhood category).
In [273]:
# One indicator column per prop_type value and per neighborhood value,
# appended to the listings frame (rows are matched on the shared index).
dummy_prop, dummy_nay = (
    pd.get_dummies(listings[column])
    for column in ('prop_type', 'neighborhood')
)
listings_prop = listings.join([dummy_prop, dummy_nay])
In [299]:
# Replace any missing booking_rate with 0.
# Assigned back explicitly: an in-place fillna on a column obtained through
# attribute access is a chained operation that pandas does not guarantee to
# write through to the parent frame (and does not under Copy-on-Write).
listings_prop['booking_rate'] = listings_prop['booking_rate'].fillna(0)
In [300]:
# Summary of the modelling frame, including the added indicator columns
listings_prop.info()
The response (y) is booking_rate; the regressors (X) are everything else, except prop_id, booking_rate, prop_type, neighborhood and number_of_bookings.
http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.train_test_split.html
http://pandas.pydata.org/pandas-docs/stable/basics.html#dropping-labels-from-an-axis
In [301]:
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# the sklearn.cross_validation module it used to live in has been removed.
# Fall back to the old location only for installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
In [302]:
# Build the modelling arrays: drop the identifier, the raw categorical columns
# (already expanded into indicator columns) and the raw booking count, then
# split the remaining frame into the response and the regressors.
non_feature_columns = ['prop_id', 'prop_type', 'neighborhood', 'number_of_bookings']
merged_final = listings_prop.drop(non_feature_columns, axis=1)
target = merged_final['booking_rate'].values
features = merged_final.drop(['booking_rate'], axis=1).values
In [304]:
In [305]:
# Model the log of the booking rate. The transform must happen BEFORE the
# split so that target_train/target_test actually carry it; previously
# `target` was logged only after the split, so the model was fitted on the
# untransformed rate.
# log1p (log(1 + x)) is used because booking_rate is 0 for listings without
# bookings and log(0) is -inf. Invert predictions with np.expm1.
# `target` itself is left untransformed so this cell can be re-run safely.
target_log = np.log1p(target)
# 70/30 train/test split; random_state is fixed so every run gives the same split.
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target_log, test_size=0.3, random_state=42)
In [306]:
In [307]:
# Linear regression model created with scikit-learn's default settings
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
In [308]:
# Fit the regression on the training portion of the split
lr.fit(feature_train, target_train)
Out[308]:
In [ ]:
In [ ]:
...type here...
In [ ]: