In [331]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
In [332]:
# Use the canonical option names ('display.max_rows' / 'display.max_columns');
# the singular forms only resolve through pandas' prefix matching and break
# if that matching ever becomes ambiguous.
pd.set_option('display.max_rows', None)      # show every row
pd.set_option('display.precision', 3)        # 3 decimals in float reprs
pd.set_option('display.max_columns', None)   # show every column
In [333]:
# Load the bookings table; parse booking_date into datetime64 at read time
# so no separate to_datetime pass is needed downstream.
bookings_path = '../data/bookings.csv'
bookings = pd.read_csv(bookings_path, parse_dates=['booking_date'])
bookings.info()
In [334]:
# Load the listings table (one row per property).
listings_path = '../data/listings.csv'
listings = pd.read_csv(listings_path)
In [335]:
# Inner-join listings with bookings; merge() joins on all shared column
# names by default.
merged = pd.merge(listings, bookings)
merged.info()
In [336]:
# Mean, median, and standard deviation of the numeric listing features.
numeric_cols = ['price', 'person_capacity', 'picture_count',
                'description_length', 'tenure_months']
listings[numeric_cols].describe().loc[['mean', '50%', 'std']]
Out[336]:
In [337]:
# Per-property-type means of the numeric features.
# NOTE: indexing a GroupBy with a bare tuple of labels was deprecated and
# then removed in modern pandas -- pass a list of column names instead.
listings.groupby('prop_type')[['price', 'person_capacity', 'picture_count',
                               'description_length', 'tenure_months']].mean()
Out[337]:
In [338]:
# Same means, broken out by (neighborhood, prop_type) pairs.
# NOTE: a list of columns (not a bare tuple) is required after GroupBy
# in modern pandas.
listings.groupby(['neighborhood', 'prop_type'])[['price', 'person_capacity',
    'picture_count', 'description_length', 'tenure_months']].mean()
Out[338]:
In [339]:
# Inspect dtypes/non-null counts of the merged frame before the date handling.
merged.info()
In [340]:
# booking_date was already parsed by read_csv(parse_dates=...), so this is a
# defensive no-op; use bracket assignment rather than attribute assignment,
# which only works for pre-existing columns and is easy to misuse.
merged['booking_date'] = pd.to_datetime(merged['booking_date'])
In [341]:
# Confirm booking_date is datetime64 after the conversion above.
merged.info()
In [342]:
# Vectorized .dt accessor instead of a Python-level lambda per row.
# dayofweek codes: Monday=0 ... Sunday=6.
merged['day_of_week'] = merged['booking_date'].dt.dayofweek
In [343]:
# Lookup table hoisted to module scope so it is built once, not on every call.
_DAY_NAMES = {0: 'Mon', 1: 'Tues', 2: 'Wed', 3: 'Thurs',
              4: 'Fri', 5: 'Sat', 6: 'Sun'}

def day(x):
    """Return the short weekday name for a pandas dayofweek code.

    Parameters
    ----------
    x : int
        Day-of-week code as produced by Series.dt.dayofweek,
        0 (Monday) through 6 (Sunday).

    Returns
    -------
    str
        Abbreviated day name ('Mon' ... 'Sun').

    Raises
    ------
    KeyError
        If x is not in 0..6 (same behavior as the per-call dict it replaces).
    """
    return _DAY_NAMES[x]
# Human-readable weekday label for each booking.
merged['day_name'] = merged['day_of_week'].apply(day)
In [344]:
# Booking counts per weekday, ordered Mon..Sun via the day_of_week key.
merged.groupby(['day_of_week'])['day_name'].value_counts().plot.bar()
Out[344]:
In [345]:
# factorplot() was renamed catplot() (and `size` renamed `height`) in
# seaborn 0.9; kind="count" reproduces the old no-y counting behavior.
sns.catplot(x="neighborhood", hue="day_name", data=merged,
            kind="count", palette="Purples_d", height=25, legend=True);
In [345]:
In [395]:
# Count bookings per property (count() of every column, grouped by prop_id),
# then attach those counts to the listing attributes.
bookings_per_prop = bookings.groupby('prop_id').count().reset_index()
merged2 = listings.merge(bookings_per_prop, on='prop_id')
merged2.head()
Out[395]:
In [396]:
# Avoid inplace=True: plain assignment keeps the pipeline explicit and makes
# the cell idempotent on re-run.
merged2 = merged2.rename(columns={'booking_date': 'number_of_bookings'})
# Bookings per month of tenure -- the per-listing demand rate.
merged2['booking_rate'] = merged2['number_of_bookings'] / merged2['tenure_months']
merged2.info()
In [397]:
# Restrict to listings with more than 9 months of tenure so booking_rate is
# computed over a reasonably long window.
merged2 = merged2.loc[merged2['tenure_months'] > 9]
merged2.info()
prop_type and neighborhood are categorical variables; use get_dummies() (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html) to transform these columns of categorical data into many columns of boolean values. (After applying this function correctly there should be one column for every prop_type category and one column for every neighborhood category.)
In [398]:
# One indicator column per category level of each categorical variable.
dummy_neigh = pd.get_dummies(merged2['neighborhood'])
dummy_prop = pd.get_dummies(merged2['prop_type'])
# Attach the prop_type indicators first; neighborhood is joined in the next cell.
merged2_prop = merged2.join(dummy_prop)
In [399]:
# Attach the neighborhood indicator columns as well.
merged3 = merged2_prop.join(dummy_neigh)
The target (y) is booking_rate; the regressors (X) are everything else except prop_id, booking_rate, prop_type, neighborhood, and number_of_bookings.
http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
http://pandas.pydata.org/pandas-docs/stable/basics.html#dropping-labels-from-an-axis
In [400]:
# sklearn.cross_validation was deprecated in 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
In [401]:
# X: every numeric/dummy column except identifiers, the raw categorical
# columns (now dummy-encoded), the target itself, and the leaky raw
# number_of_bookings count.
drop_cols = ['prop_id', 'prop_type', 'neighborhood', 'number_of_bookings']
merged_final = merged3.drop(drop_cols, axis=1)
target = merged_final['booking_rate'].values
features = merged_final.drop(['booking_rate'], axis=1).values
In [401]:
In [402]:
# Sanity check: (n_samples, n_features) of the design matrix.
features.shape
Out[402]:
In [403]:
# Log-transform the booking rate before regressing on it.
# NOTE(review): any zero booking_rate becomes -inf here; the clamp applied to
# the predictions below suggests extreme values do occur -- confirm upstream.
target = np.log(target)
In [404]:
# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
feature_train, feature_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2, random_state=9
)
In [419]:
# Ordinary-least-squares baseline model.
# NOTE(review): this import belongs in the top imports cell.
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
In [420]:
# Fit OLS coefficients on the training split.
lr.fit(feature_train, target_train )
Out[420]:
In [421]:
# Predict log booking_rate for the held-out 20%.
target_pred = lr.predict(feature_test)
In [422]:
# Earlier variant: replace only the single minimum prediction.
# target_pred[target_pred == target_pred.min()] = target_pred.mean()
# HACK: clamp pathologically low (log-scale) predictions to the mean so a few
# extreme outliers do not dominate the squared-error sums below.  The -4
# threshold looks hand-tuned -- TODO justify or remove.
target_pred[target_pred < -4] = target_pred.mean()
target_pred
Out[422]:
In [423]:
# Residual sum of squares of the (clamped) model predictions.
sum_sq_model = ((target_test - target_pred) ** 2).sum()
sum_sq_model
Out[423]:
In [424]:
# Total sum of squares: the error of always predicting the test-set mean.
sum_sq_naive = ((target_test - target_test.mean()) ** 2).sum()
sum_sq_naive
Out[424]:
In [425]:
# R^2 by hand: 1 - RSS/TSS.  Uses the clamped predictions above, so it can
# differ from lr.score() in the next cell.
1 - sum_sq_model / sum_sq_naive
Out[425]:
In [426]:
# Built-in R^2 on the test split; recomputes predictions internally, so the
# clamp applied to target_pred above does not affect this number.
lr.score(feature_test, target_test, sample_weight = None)
Out[426]:
In [388]:
# Predicted vs. actual (log) booking rate; the red y = x line marks a
# perfect prediction.  Labels/title added so the figure stands alone.
fig, ax = plt.subplots(1, 1)
ax.scatter(target_pred, target_test)
ax.plot(target, target, 'r')
ax.set_xlabel('predicted log booking_rate')
ax.set_ylabel('actual log booking_rate')
ax.set_title('Linear regression: predicted vs. actual booking rate')
Out[388]:
The score method is an indicator of how well the model is performing. It sums the squared differences between the test y values and the predicted y values, divides that by the total sum of squares around the mean, and subtracts the ratio from 1. A value of 1 means the squared error is 0, which would reflect perfectly predicted y values. A value below 0 means the predictions are worse than simply using the mean of y as every prediction.
The model comes up with a generally low or sometimes negative score, varying on each run because train_test_split draws a different sample each time it is called without a fixed random_state. That means the predictions are generally only slightly better or slightly worse than the mean, depending on the particular train/test sample.
In [191]:
# Expected monthly revenue = bookings per month x price per booking.
merged_final['monthly_rev'] = merged_final['booking_rate'] * merged_final['price']
In [192]:
# Inspect the distribution of monthly revenue before modeling it.
merged_final.monthly_rev.hist(bins = 50);
In [193]:
# Feature matrix for the revenue model: drop both targets
# (booking_rate and monthly_rev) so neither leaks into X.
features1 = merged_final.drop(['booking_rate', 'monthly_rev'], axis = 1).values
In [194]:
# New target: expected monthly revenue per listing.
target1 = merged_final['monthly_rev']
In [195]:
# Log-transform the revenue target.
# NOTE(review): any zero-revenue row becomes -inf here -- confirm none exist.
target1 = np.log(target1)
In [196]:
# Fix random_state so the split (and every score below) is reproducible on
# re-run; the first model's split used random_state=9 as well.  The prose at
# the end of the first model section notes the scores varied per run -- this
# was why.
features_train1, features_test1, target_train1, target_test1 = train_test_split(
    features1, target1, test_size=0.2, random_state=9
)
In [197]:
# Refit the same LinearRegression instance on the revenue target.
lr.fit(features_train1, target_train1)
Out[197]:
In [198]:
# NOTE(review): this dumps the entire training matrix into the notebook
# output -- consider features_train1[:5] or features_train1.shape instead.
features_train1
Out[198]:
In [199]:
# Predict log monthly revenue on the held-out rows.
target_pred1 = lr.predict(features_test1)
In [199]:
In [200]:
# Residual sum of squares for the revenue model (reuses the name from the
# first model, overwriting it).
sum_sq_model = ((target_test1 - target_pred1) ** 2).sum()
sum_sq_model
Out[200]:
In [201]:
# Total sum of squares around the test-set mean (naive baseline).
sum_sq_naive = ((target_test1 - target_test1.mean()) ** 2).sum()
sum_sq_naive
Out[201]:
In [202]:
# R^2 by hand for the revenue model: 1 - RSS/TSS.
1 - sum_sq_model / sum_sq_naive
Out[202]:
In [203]:
# Built-in R^2 for the monthly-revenue model on the test split.
lr.score(features_test1, target_test1, sample_weight = None)
Out[203]:
In [204]:
# Predicted vs. actual (log) monthly revenue; red y = x line marks a perfect
# prediction.  Labels/title added so the figure stands alone.
fig, ax = plt.subplots(1, 1)
ax.scatter(target_pred1, target_test1)
ax.plot(target1, target1, 'r')
ax.set_xlabel('predicted log monthly_rev')
ax.set_ylabel('actual log monthly_rev')
ax.set_title('Linear regression: predicted vs. actual monthly revenue')
Out[204]:
In [43]:
In [43]:
In [ ]: