In this assignment, your challenge is to do some basic analysis for Airbnb. Two data files are provided in hw/data/: bookings.csv and listings.csv. The objective is to practice data munging and begin our exploration of regression.
In [4]:
# Okay!
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image
# This enables inline Plots
%matplotlib inline
In [5]:
# pd.read_csv() pulls in each file and creates a pandas DataFrame
bookings = pd.read_csv('../data/bookings.csv')
listings = pd.read_csv('../data/listings.csv')
# the following .head(5) call displays the column headers and first 5 rows of the DataFrame
# I used this to ensure the DataFrame was read in correctly
listings.head(5)
# bookings.head(5)
Out[5]:
In [6]:
# The .describe() function displays summary statistics for each numeric column.
# The median is equivalent to the 50% quartile (the 50% row in the output).
listings.describe()
# Which is the median?
# What is this question asking?
# Describe standard deviation:
# The standard deviation is a value that shows how spread out the numbers in a set are.
# For example, picture_count has a standard deviation of about 10.5.
# If the distribution were normal (which it is not), roughly 68.27% of listings would have a picture_count within 1 std. dev. (10.5) of the mean (14.4).
Out[6]:
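As a quick check on the standard-deviation note above, here is a small sketch (assuming picture_count is the column in question, as used elsewhere in this notebook) that computes the mean and standard deviation directly and the actual fraction of listings within one standard deviation of the mean:

# Sketch: verify the quoted mean/std and the within-1-std fraction for picture_count
pic = listings['picture_count']
pic_mean, pic_std = pic.mean(), pic.std()
within_one_std = ((pic - pic_mean).abs() <= pic_std).mean()
print(pic_mean, pic_std, within_one_std)  # would be ~0.6827 only for a roughly normal distribution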
In [7]:
# I used groupby to display the information in a table format
listings.groupby('prop_type')[['person_capacity', 'picture_count', 'description_length', 'tenure_months', 'price']].agg(['mean'])
Out[7]:
In [8]:
# I added a variable to the groupby to show neighborhood.
# Depending on how you want to look at the data, it might be more interesting to look at property type first, then neighborhood,
# or neighborhood, then prop type. Both are provided.
# listings.groupby(['prop_type', 'neighborhood'])[['person_capacity', 'picture_count', 'description_length', 'tenure_months', 'price']].agg(['mean'])
listings.groupby(['neighborhood', 'prop_type'])[['person_capacity', 'picture_count', 'description_length', 'tenure_months', 'price']].agg(['mean'])
Out[8]:
In [9]:
print(type(bookings))
bookings.head(5)
bookings.sort_values(by='booking_date', ascending=False)
bookingCounts = bookings.groupby(['booking_date'])['prop_id'].agg(['count'])
# Note to self: create a dataframe in order to graph the histogram with labels
# need to make sure that both columns were/are labeled.
# bookingCounts = bookings['booking_date'].value_counts()
# print(type(bookingCounts))
# df = pd.DataFrame(bookingCounts)
# print(df)
# df.info()
print(bookingCounts)
propBookingCounts = bookings.groupby(['prop_id'])['booking_date'].agg(['count'])
print(propBookingCounts.info())
In [10]:
bookingCounts.hist()
# need to label axis
Out[10]:
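To address the "# need to label axis" note above, one possible way to label the histogram of the same daily booking counts is to use matplotlib directly (a sketch, not part of the graded output):

# Sketch: same histogram of daily booking counts, with axis labels
fig, ax = plt.subplots(1, 1)
ax.hist(bookingCounts['count'].values)
ax.set_xlabel('bookings per day')
ax.set_ylabel('number of days')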
In [11]:
# First step is to merge the two DataFrames, since bookings has the booking dates and listings has the neighborhood
listMerge = listings.merge(bookings, on='prop_id')
listGroup = listMerge.groupby(['neighborhood','booking_date'])['prop_id'].agg(['count']).unstack(0)
listGroup.plot()
Out[11]:
In [12]:
listMerge.head()
Out[12]:
In [13]:
listGroup.head()
Out[13]:
In [14]:
# @Chad&Ramesh ... I don't understand how this is supposed to work.
# adding the columns is easy, but how are we supposed to iterate through and include the values?
listingsWithPropCount = listings.merge(propBookingCounts, left_on='prop_id', right_index=True)
listingsWithPropCount.rename(columns={'count': 'number_of_bookings'}, inplace=True)
# listings['booking_rate'] = listings.prop_id.map(booking_rate_map)
# listings['booking_rate'] = ""
# !!! things that don't work: !!!
# propBookingCounts.rename(columns={'count': 'number_of_bookings'}, inplace=True)
# propBookingCounts.ix[0:2, ['prop_id', 'number_of_bookings']]
# print propBookingCounts.head()
# listings['number_of_bookings'] = propBookingCounts.row_dt.map(lambda x: x.count)
# combiner = lambda x, y: np.where(isnull(x), y, x)
# listings.combine(propBookingCounts, combiner)
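For the question above about how to bring the per-property counts into listings without iterating: besides the merge used here, a map-based sketch (assuming any prop_id missing from propBookingCounts should count as 0 bookings) looks like this:

# Sketch: map each prop_id to its booking count via the propBookingCounts index
booking_count_map = propBookingCounts['count']  # Series indexed by prop_id
listings_alt = listings.copy()
listings_alt['number_of_bookings'] = listings_alt['prop_id'].map(booking_count_map).fillna(0)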
In [15]:
listingsWithPropCount.head()
Out[15]:
In [16]:
# def get_booking_rate(val):
# if number_of_bookings != 0:
# return number_of_bookings/listings['tenure_months']
# else:
# return 0
# booking_rate = number_of_bookings / tenure_months
listingsWithPropCount['booking_rate'] = (listingsWithPropCount['number_of_bookings']/listingsWithPropCount['tenure_months'])
listingsWithPropCount.head()
Out[16]:
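The commented-out get_booking_rate function above hints at guarding against a zero tenure. A hedged sketch (assuming some listings might have tenure_months == 0, which I have not verified in this data) would be:

# Sketch: avoid dividing by zero if any listing has tenure_months == 0
safe_tenure = listingsWithPropCount['tenure_months'].replace(0, np.nan)
listingsWithPropCount['booking_rate'] = (listingsWithPropCount['number_of_bookings'] / safe_tenure).fillna(0)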
In [17]:
established_properties = listingsWithPropCount[listingsWithPropCount.tenure_months > 10]
established_properties
Out[17]:
prop_type and neighborhood are categorical variables; use get_dummies() (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.reshape.get_dummies.html) to transform these columns of categorical data into many columns of boolean values (after applying this function correctly there should be 1 column for every prop_type and 1 column for every neighborhood category).
In [18]:
# is this something new? I cannot find references to this from lectures?
pd.get_dummies(established_properties)
Out[18]:
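One hedged aside: calling get_dummies on the whole frame encodes every object-dtype column it finds. Restricting it to the two categorical columns named in the instructions keeps the numeric columns untouched:

# Sketch: dummy-encode only the two categorical columns
dummied = pd.get_dummies(established_properties, columns=['prop_type', 'neighborhood'])
dummied.head()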
The predictor (y) is booking_rate; the regressors (X) are everything else, except prop_id, booking_rate, prop_type, neighborhood and number_of_bookings.
http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.train_test_split.html
http://pandas.pydata.org/pandas-docs/stable/basics.html#dropping-labels-from-an-axis
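Following the drop-labels link above, a sketch of building y and X by dropping the excluded columns (rather than listing the kept ones, as the next cell does) could look like this, assuming established_properties still contains all five excluded columns:

# Sketch: y is booking_rate, X is everything except the excluded columns
y_sketch = established_properties['booking_rate']
X_sketch = established_properties.drop(['prop_id', 'booking_rate', 'prop_type', 'neighborhood', 'number_of_bookings'], axis=1)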
In [19]:
# @Chad/Ramesh -- it's not clear to me what is supposed to be done here. Are we supposed to graph the data?
# Note: in newer scikit-learn versions, train_test_split lives in sklearn.model_selection instead.
from sklearn.cross_validation import train_test_split
from IPython.core.pylabtools import figsize
from sklearn.linear_model import LinearRegression
In [20]:
x = established_properties[['price','person_capacity','picture_count','description_length','tenure_months']].values
y = established_properties['booking_rate'].values
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.8)
clf = LinearRegression()
clf.fit(x_train, y_train)
Out[20]:
In [21]:
ratePrediction = clf.predict(x_test)
sum_sq_model = np.sum((y_test - ratePrediction) ** 2)
sum_sq_model
Out[21]:
In [22]:
sum_sq_naive = np.sum((y_test - y.mean()) ** 2)
sum_sq_naive
Out[22]:
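To connect the two sums of squares above: R^2 is one minus their ratio, which is essentially what LinearRegression.score reports (score uses the test-set mean for the naive baseline, whereas sum_sq_naive above uses y.mean()):

# Sketch: R^2 on the test set from the model and naive sums of squared errors
r_squared = 1 - sum_sq_model / sum_sq_naive
print(r_squared)
# compare with clf.score(x_test, y_test)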
In [23]:
fig, ax = plt.subplots(1, 1)
ax.scatter(ratePrediction, y_test)
ax.set_xlabel('Predicted booking_rate')
ax.set_ylabel('Actual booking_rate')
# Draw the ideal line
ax.plot(y, y, 'r')
Out[23]:
In [1]:
# a, b = np.arange(10).reshape((5, 2)), range(5)
# a_train, a_test, b_train, b_test = train_test_split(a, b, test_size=0.33, random_state=42)
# print a_train
# print a_test
# print b_train
# print b_test
In [25]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
In [25]:
http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression.score
In [34]:
clf.score(x, y)
Out[34]:
...type here...
In [ ]: