In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# draw plots in notebook
%matplotlib inline
In [2]:
# make plots SVG (higher quality)
%config InlineBackend.figure_format = 'svg'
In [3]:
# parsing dates is more time/compute intensive, but we know we definitely need these columns as datetimes
df = pd.read_csv('data/sf_listings.csv', parse_dates=['last_review'], infer_datetime_format=True)
df_reviews = pd.read_csv('data/reviews.csv', parse_dates=['date'], infer_datetime_format=True)
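With dates parsed up front, the .dt accessor and the rest of pandas' datetime machinery work immediately (a quick sketch, assuming the columns parsed cleanly):
In [ ]:
# the parsed column supports datetime operations directly
df_reviews['date'].dt.year.head()                    # year of each review
df_reviews['date'].max() - df_reviews['date'].min()  # span of the review history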
In [4]:
df_reviews.date[0]
Out[4]:
In [5]:
df.head()
Out[5]:
In [6]:
# display general diagnostic info
df.info()
In [7]:
df_reviews.head()
Out[7]:
In [8]:
# index DataFrame on listing_id in order to join datasets
reindexed_df = df_reviews.set_index('listing_id')
reindexed_df.head()
Out[8]:
In [9]:
# remember the original id in a column to group on
df['listing_id'] = df['id']
df_listing = df.set_index('id')
df_listing.head()
Out[9]:
In [10]:
# join the listing information with the review information
review_timeseries = df_listing.join(reindexed_df)
print(review_timeseries.columns)
review_timeseries.head()
Out[10]:
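For reference, pd.merge can do the same join on columns directly, without reindexing first (a sketch; same column names as above):
In [ ]:
# equivalent join without set_index: match listings' id to reviews' listing_id
merged = pd.merge(df, df_reviews, left_on='id', right_on='listing_id', how='left')
merged.head()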
In [11]:
# histogram of review counts per listing: nothing new/interesting here...
review_timeseries.groupby('listing_id').count()['name'].hist(bins=100, figsize=(12,6));
In [12]:
# this causes Python to crash; let's see if there is a better way
# review_timeseries.groupby(['neighbourhood','date']).count()
In [13]:
# let's try a pivot table (pd.crosstab counts date/neighbourhood co-occurrences)...
reviews_over_time = pd.crosstab(review_timeseries.date, review_timeseries.neighbourhood)
reviews_over_time.head()
Out[13]:
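The earlier groupby likely crashed because .count() tallies every column for every (neighbourhood, date) pair; counting just the group sizes and unstacking builds the same table far more cheaply (a sketch):
In [ ]:
# count rows per (date, neighbourhood) pair, then pivot neighbourhoods into columns
counts = review_timeseries.groupby(['date', 'neighbourhood']).size()
counts.unstack('neighbourhood').fillna(0).head()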
In [14]:
# let's look at some particular neighborhoods
neighborhoods = df.neighbourhood.unique()
print(neighborhoods)
In [16]:
# a little noisy
reviews_over_time[['Mission', 'South of Market', 'Noe Valley']].plot(figsize=(12,6))
Out[16]:
In [17]:
# smooth by resampling by month
reviews_over_time.resample('M').mean()[['Mission', 'South of Market', 'Noe Valley']].plot(figsize=(12,6))
Out[17]:
Pandas resample string conventions: resampling frequencies are given as offset aliases such as 'D' (day), 'W' (week), and 'M' (month end); see the pandas documentation for the full table.
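A quick illustration (a sketch using the crosstab built above):
In [ ]:
# the same series at different granularities via resample offset aliases
reviews_over_time.resample('W').sum().head()    # weekly review totals
reviews_over_time.resample('M').mean().head()   # monthly mean of daily counts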
Exercise: using the pandas correlation function (DataFrame.corr), find which columns correlate with increased activity (number of reviews and reviews per month).
In [ ]:
# Exercise 1 Solution
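# one possible approach (a sketch, not the canonical solution; it assumes the
# summary file includes a reviews_per_month column, as the prompt implies)
numeric = df.select_dtypes(include='number')
numeric.corr()[['number_of_reviews', 'reviews_per_month']].sort_values(
    'number_of_reviews', ascending=False)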
Correlation and simple linear regression are actually nearly the same thing mathematically, at least inferentially: http://stats.stackexchange.com/questions/2125/whats-the-difference-between-correlation-and-simple-linear-regression
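Concretely, for a simple regression y = a + b*x, the slope relates to Pearson's r by r = b * sd(x) / sd(y). A quick numeric check (a sketch; assumes the reviews_per_month column is present and zero-fills its NaNs for the demo):
In [ ]:
import numpy as np

x = df['number_of_reviews'].astype(float).values
y = df['reviews_per_month'].fillna(0).values
b = np.polyfit(x, y, 1)[0]          # slope of the simple regression y = a + b*x
r = np.corrcoef(x, y)[0, 1]         # Pearson correlation
print(r, b * x.std() / y.std())     # the two numbers should match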
In [18]:
from sklearn import linear_model
In [31]:
features = df[['host_name', 'neighbourhood', 'room_type', 'minimum_nights',
               'number_of_reviews', 'calculated_host_listings_count', 'availability_365']]
labels = df['price']
In [20]:
# no price!
features.head()
Out[20]:
In [21]:
# Categorical -> One Hot Encoding
# http://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-categorical-features
dummies = pd.get_dummies(features)
# sklearn likes matrices
feature_matrix = dummies.as_matrix()
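To see what get_dummies actually does, here is a toy example (illustrative only):
In [ ]:
# each category level becomes its own 0/1 indicator column
toy = pd.DataFrame({'room_type': ['Entire home/apt', 'Private room', 'Entire home/apt']})
pd.get_dummies(toy)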
In [22]:
labels.as_matrix()
Out[22]:
In [23]:
feature_matrix
Out[23]:
In [24]:
# Initialize and Fit sklearn model
model = linear_model.LinearRegression()
clf = model.fit(feature_matrix, labels.as_matrix())
In [25]:
# How well did we do?
clf.score(feature_matrix, labels.as_matrix())
Out[25]:
In [27]:
print "There are {0} features...".format(len(clf.coef_))
clf.coef_
Out[27]:
In [32]:
# Remove the host_name column; we are probably overfitting on it...
no_name = features.copy()
no_name.pop('host_name')
no_names_feature_m = pd.get_dummies(no_name).as_matrix()
In [33]:
model = linear_model.LinearRegression(normalize=True)
clf = model.fit(no_names_feature_m, labels.as_matrix())
In [34]:
# Turns out the name feature is highly predictive...
# but not very useful: https://www.kaggle.com/wiki/Leakage
clf.score(no_names_feature_m, labels.as_matrix())
Out[34]:
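Note that scoring on the same data we trained on flatters the model. A more honest estimate holds out a test set (a minimal sketch using scikit-learn's train_test_split):
In [ ]:
from sklearn.model_selection import train_test_split

# hold out 20% of the listings so the score reflects unseen data
X_train, X_test, y_train, y_test = train_test_split(
    no_names_feature_m, labels.values, test_size=0.2, random_state=0)
holdout_clf = linear_model.LinearRegression().fit(X_train, y_train)
holdout_clf.score(X_test, y_test)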
In [35]:
len(clf.coef_)
Out[35]:
In [37]:
# We need more and better features
df2 = pd.read_csv('data/listings_full.csv')
df2.columns
Out[37]:
In [38]:
df2.head()
Out[38]:
In [39]:
# get a snapshot of some of the columns in the center of the matrix
df2.iloc[1:5, 40:60]
Out[39]:
In [40]:
# optimistically, let's just use a few key features to start. Remember Occam's razor...
select_features = df2[['host_has_profile_pic', 'host_identity_verified', 'host_listings_count',
                       'host_response_time', 'host_acceptance_rate', 'host_is_superhost',
                       'transit', 'neighbourhood_cleansed', 'is_location_exact',
                       'property_type', 'room_type', 'accommodates', 'bathrooms',
                       'bedrooms', 'beds']]
In [41]:
select_features.head()
Out[41]:
In [43]:
# more feature engineering: fill in missing data, since missing values will break our model
select_features = select_features.fillna({'host_response_time': 'NA', 'host_acceptance_rate': '-1%'})
select_features.info()
In [44]:
# convert the percentage string into a float
select_features.host_acceptance_rate = select_features.host_acceptance_rate.str.strip('%').astype(float) / 100
In [45]:
# Binarize the transit column... the listing either has a transit description or it doesn't
# (notnull() is True when a description is present)
select_features.transit = select_features.transit.notnull()
In [46]:
select_features.transit
Out[46]:
In [47]:
# One last fill in case we missed any nulls
dummies = pd.get_dummies(select_features).fillna(0)
feature_matrix = dummies.as_matrix()
In [48]:
# Price as a currency string -> price as a float
labels = df2.price.str.strip('$').str.replace(',', '').astype(float)
In [49]:
# initialize model again
model = linear_model.LinearRegression(normalize=True)
clf = model.fit(feature_matrix, labels)
In [50]:
# much better!
clf.score(feature_matrix, labels)
Out[50]:
In [52]:
# a sweet spot between over- and underfitting
len(clf.coef_)
Out[52]:
In [53]:
# Predict what we should price listing 1100 at, given its features
clf.predict(feature_matrix[1100].reshape(1, -1))
Out[53]:
In [55]:
# Looks like it is overpriced...
df2.iloc[1100].price
Out[55]:
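To gauge mispricing across the whole dataset rather than one listing, compare predictions with actual prices (a sketch):
In [ ]:
# residuals: positive means the listing is priced above the model's estimate
residuals = labels - clf.predict(feature_matrix)
residuals.describe()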
In [56]:
# And it shows... there are only 2 reviews per month
df2.iloc[1100]
Out[56]:
In [57]:
# whereas the top listings have 10+ reviews per month
df2.sort_values('reviews_per_month', ascending=False).head()
Out[57]:
In [58]:
# Zip together our column names with our beta coefficients
# (wrapped in list() so it can be sorted more than once under Python 3)
coefficients = list(zip(dummies.columns, clf.coef_))
In [59]:
# Largest positive coefficients
sorted(coefficients, key=lambda coef: coef[1], reverse=True)[:10]
Out[59]:
In [60]:
# Largest negative coefficients
sorted(coefficients, key=lambda coef: coef[1])[:10]
Out[60]:
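A caveat: with unscaled one-hot features, raw coefficient size is not statistical significance, just the price change associated with that feature. Sorting by absolute magnitude at least surfaces the strongest effects in either direction (a sketch):
In [ ]:
# largest effects regardless of sign
sorted(coefficients, key=lambda coef: abs(coef[1]), reverse=True)[:10]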
In [ ]: