In this assignment your challenge is to do some basic analysis for Airbnb. Provided in hw/data/ there are 2 data files, bookings.csv and listings.csv. The objective is to practice data munging and begin our exploration of regression.
In [483]:
# Standard imports for data analysis packages in Python
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image
# This enables inline Plots
%matplotlib inline
In [484]:
# Read the two raw data files into DataFrames for analysis.
bookings = pd.read_csv('../data/bookings.csv')
listings = pd.read_csv('../data/listings.csv')
In [488]:
# Peek at the first rows of listings to see its columns and sample values.
listings.head(n=5)
Out[488]:
In [489]:
# Peek at the first rows of bookings to see its columns and sample values.
bookings.head(n=5)
Out[489]:
In [490]:
# Summary statistics (count, mean, std, quartiles) for price, person capacity,
# picture count, description length and tenure of the properties.
numeric_listing_cols = listings.iloc[:, 3:8]
numeric_listing_cols.describe()
Out[490]:
In [480]:
# Median of the same numeric columns (describe() does not include it).
numeric_listing_cols = listings.iloc[:, 3:8]
numeric_listing_cols.median()
Out[480]:
In [491]:
# Mean price, person capacity, picture count, description length and tenure
# months, grouped by property type.
# BUG FIX: selecting multiple columns on a GroupBy requires a list of names;
# the bare comma form ["a","b"] passes a tuple and was removed in modern pandas.
listings.groupby(["prop_type"])[["price", "person_capacity", "picture_count", "description_length", "tenure_months"]].mean()
Out[491]:
In [492]:
# Same means, grouped by property type AND neighborhood.
# BUG FIX: multi-column selection on a GroupBy needs a list (double brackets);
# the tuple form after groupby was removed in modern pandas.
listings.groupby(["prop_type", "neighborhood"])[["price", "person_capacity", "picture_count", "description_length", "tenure_months"]].mean()
Out[492]:
In [590]:
# Line plot of bookings per day using value_counts().
# BUG FIX: value_counts() returns counts sorted by frequency, so the dates on
# the x-axis were in descending-count order; sort by date before plotting.
bookings.booking_date.value_counts().sort_index().plot(kind='line')
Out[590]:
In [494]:
# Same daily-bookings line plot, built with groupby instead of value_counts
# (groupby sorts by the key, so no extra sorting is needed).
daily_counts = bookings.groupby("booking_date")["booking_date"].count()
daily_counts.plot(kind='line')
Out[494]:
In [495]:
#listmerge.groupby(["neighborhood","booking_date"])["prop_id"].agg(['count']).unstack(0)
In [496]:
# Bookings per property: one row per prop_id, single column number_of_bookings.
# BUG FIX: the original counted the "prop_id" column, then renamed a
# nonexistent "booking_date" column (a no-op), then dropped "prop_id" —
# leaving an empty frame. size() counts rows per group directly and
# to_frame() gives the count column the required name.
nbookings = bookings.groupby("prop_id").size().to_frame("number_of_bookings")
In [509]:
#nbookings
In [499]:
# Index listings by prop_id (under the index name "prop_index") while keeping
# the prop_id column itself, so we can join with nbookings on the index.
# Passing a renamed Series to set_index replaces the original
# temporary-column + inplace two-step with a single idempotent statement.
listings = listings.set_index(listings["prop_id"].rename("prop_index"))
In [507]:
#listings.prop_id
In [503]:
# Attach the per-property booking counts to listings; both frames are
# indexed by prop_id, so this is an index-on-index (left) join.
listings = listings.join(nbookings, how='left')
In [510]:
#listings.head(100)
In [511]:
# New calculated column: bookings per month of tenure for each property.
listings["booking_rate"] = listings["number_of_bookings"].div(listings["tenure_months"])
In [513]:
# Spot-check a single row after the join and the new column.
listings.head(n=1)
Out[513]:
In [473]:
# Well-established properties: at least 10 months of tenure.
# BUG FIX: .copy() makes this an explicit copy rather than a view, so the
# later column assignments on listings_we (dummies, booking_rate_log) do not
# trigger SettingWithCopyWarning or write back into `listings`.
listings_we = listings[listings.tenure_months >= 10].copy()
prop_type and neighborhood are categorical variables; use get_dummies() (http://pandas.pydata.org/pandas-docs/stable/generated/pandas.core.reshape.get_dummies.html) to transform these columns of categorical data into many columns of boolean values (after applying this function correctly there should be 1 column for every prop_type and 1 column for every neighborhood category).
In [515]:
# One boolean indicator column per property type.
dummies_prop_type = pd.get_dummies(listings_we.prop_type)
In [516]:
# One boolean indicator column per neighborhood.
dummies_neighborhood = pd.get_dummies(listings_we.neighborhood)
In [589]:
# Sanity-check both dummy frames: column names, dtypes and row counts.
for dummies in (dummies_neighborhood, dummies_prop_type):
    dummies.info()
In [530]:
# Merge the property-type indicator columns into the main frame (index join).
listings_we = listings_we.join(dummies_prop_type, how='left')
In [531]:
# Merge the neighborhood indicator columns into the main frame (index join).
listings_we = listings_we.join(dummies_neighborhood, how='left')
In [533]:
listings_we.columns
# Columns that still contain null values after the join:
# booking_rate, number_of_bookings — presumably properties with no rows in
# bookings.csv; verified (by count) in the next cell.
Out[533]:
In [534]:
# Number of rows with a null booking_rate — these must be removed before
# fitting, since it is our target variable.
listings_we.booking_rate.isnull().sum()
Out[534]:
In [535]:
# booking_rate is going to be our predicted variable, so drop the rows where
# it is NaN before building the train/test sets.
listings_we = listings_we.dropna(subset=["booking_rate"])
The predictor (y) is booking_rate; the regressors (X) are everything else, except prop_id, booking_rate, prop_type, neighborhood and number_of_bookings.
http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.train_test_split.html
http://pandas.pydata.org/pandas-docs/stable/basics.html#dropping-labels-from-an-axis
In [536]:
# BUG FIX: sklearn.cross_validation was deprecated in 0.18 and removed in
# 0.20; train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
In [537]:
# Build the design matrix X (features) and target vector y as numpy arrays.
# The dummy-column names come straight from the data values, hence the spaces.
feature_cols = [
    "price", "person_capacity", "picture_count", "description_length",
    "tenure_months", "Property type 1", "Property type 2", "Property type 3",
    "Neighborhood 11", "Neighborhood 12", "Neighborhood 13", "Neighborhood 14",
    "Neighborhood 15", "Neighborhood 16", "Neighborhood 17", "Neighborhood 18",
    "Neighborhood 19", "Neighborhood 20", "Neighborhood 21", "Neighborhood 4",
    "Neighborhood 5", "Neighborhood 9",
]
features = listings_we[feature_cols].values
target = listings_we["booking_rate"].values
In [538]:
# features.shape -> (n_samples, n_features) of the design matrix X
features.shape
Out[538]:
In [539]:
# target.shape -> (n_samples,): one booking_rate value per listing
target.shape
Out[539]:
In [540]:
# Quick look at the raw target values
target
Out[540]:
In [562]:
# Train/test split with 33% held out for evaluation.
# A fixed random_state makes the split — and every score computed below —
# reproducible across kernel restarts (the original split was unseeded).
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.33, random_state=42)
In [563]:
# Ordinary least squares linear model (default settings)
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
In [564]:
# Fit the linear model on the training split.
lr.fit(X=features_train, y=target_train)
Out[564]:
In [565]:
# Predict booking_rate for the held-out test features using the fitted model.
target_pred = lr.predict(X=features_test)
In [566]:
# Residual sum of squares on the test set.
residuals = target_test - target_pred
sum_sq_model = (residuals ** 2).sum()
sum_sq_model
Out[566]:
In [567]:
# R^2 of the model on the held-out test data (1.0 is perfect; lower is worse).
lr.score(features_test, target_test)
Out[567]:
In [568]:
# Predicted vs actual booking_rate on the test set.
fig, ax = plt.subplots(1, 1)
ax.scatter(target_pred, target_test)
ax.set(xlabel='Predicted', ylabel='Actual')
# Ideal y = x reference line, drawn over the full range of the target.
ax.plot(target, target, 'r')
Out[568]:
The score method gives us the R-squared value: it returns the R² of self.predict(X) with respect to y. The best possible score is 1.0; lower values are worse. Since the score in this case is low, this tells us that the model is not good.
In [568]:
In [569]:
# Explore which columns correlate with booking_rate.
# BUG FIX: pandas.tools.plotting was removed; scatter_matrix now lives in
# pandas.plotting.
from pandas.plotting import scatter_matrix
# Only a small column subset — the full feature list makes an unreadable grid.
scat = scatter_matrix(listings_we[["booking_rate", "person_capacity"]])
In [576]:
# The log of booking_rate looks more normally distributed than the raw rate
# (compare with the histogram of the untransformed rate in the next cell).
booking_rate_log = np.log(listings_we["booking_rate"])
booking_rate_log.hist()
Out[576]:
In [571]:
# Histogram of the original (untransformed) booking rate, for comparison
listings_we["booking_rate"].hist()
Out[571]:
In [585]:
# log(booking_rate) is negative for rates below 1, so shift the values up by
# 5 points to keep the transformed target on the positive axis.
listings_we["booking_rate_log"] = np.log(listings_we["booking_rate"]) + 5
In [586]:
# The shifted histogram (log + 5) below is more normal and on the positive axis
listings_we["booking_rate_log"].hist()
Out[586]:
In [587]:
# Same pipeline as above, but fitting against the shifted log booking rate.
lr_log = LinearRegression()
features_log = listings_we[["price","person_capacity","picture_count","description_length","tenure_months","Property type 1","Property type 2", u'Property type 3', u'Neighborhood 11', u'Neighborhood 12', u'Neighborhood 13', u'Neighborhood 14', u'Neighborhood 15', u'Neighborhood 16', u'Neighborhood 17', u'Neighborhood 18', u'Neighborhood 19', u'Neighborhood 20', u'Neighborhood 21', u'Neighborhood 4', u'Neighborhood 5', u'Neighborhood 9']].values
target_log = listings_we["booking_rate_log"].values
# random_state keeps the split (and the scores below) reproducible.
features_log_train, features_log_test, target_log_train, target_log_test = train_test_split(
    features_log, target_log, test_size=0.33, random_state=42)
lr_log.fit(features_log_train, target_log_train)
target_log_pred = lr_log.predict(features_log_test)
# Residual sum of squares on the held-out set.
sum_sq_model_log = np.sum((target_log_test - target_log_pred) ** 2)
# BUG FIX: Python 2 `print x` statements are syntax errors in Python 3.
print(sum_sq_model_log)
print(lr_log.score(features_log_test, target_log_test))
In [588]:
# Predicted vs actual for the log-transformed model.
fig, ax = plt.subplots(1, 1)
ax.scatter(target_log_pred, target_log_test)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
# BUG FIX: the ideal y = x line must be drawn with the log-scale target;
# the original used `target`, which is on the raw booking_rate scale and
# places the reference line in the wrong part of the axes.
ax.plot(target_log, target_log, 'r')
Out[588]:
In [464]:
# Try an arcsine transform of booking_rate.
# BUG FIX: `np.(...)` was a syntax error; the variable name implies np.arcsin.
# NOTE(review): np.arcsin is only defined on [-1, 1] — booking rates above 1
# will produce NaN; confirm the data range (or clip/rescale) before relying
# on this transform.
booking_rate_arcsin = np.arcsin(listings_we.booking_rate)
In [465]:
# Histogram of the arcsine-transformed booking rate
booking_rate_arcsin.hist()
Out[465]:
In [ ]: