In [1]:
import sys
import pandas as pd
import matplotlib as ml
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Report the interpreter and library versions this notebook was run with.
for label, version in (
    ("Python version: ", sys.version),
    ("Pandas version: ", pd.__version__),
    ("Matplotlib version: ", ml.__version__),
):
    print(label, version)
First, we read in the data.
In [3]:
# The listings file is a gzip-compressed CSV; the address is split into a
# base part and a dataset-specific path.
url1 = "http://data.insideairbnb.com/united-states/"
url2 = "ny/new-york-city/2016-02-02/data/listings.csv.gz"
full_df = pd.read_csv(url1 + url2, compression="gzip")

# Preview the first three rows of a handful of columns.
full_df.loc[
    :,
    [
        "id",
        "last_scraped",
        "name",
        "description",
        "number_of_reviews",
        "price",
        "review_scores_rating",
    ],
].head(3)
Out[3]:
We don't need all of the data, so let's focus on a few variables.
In [4]:
# Show the dtype pandas inferred for every column of the raw listings frame.
full_df.dtypes
Out[4]:
In [5]:
# List every column name available in the raw listings frame.
full_df.columns
Out[5]:
In [6]:
# Keep only the columns used in the analysis below. The explicit .copy()
# makes `df` an independent frame, so later edits to it neither trigger
# SettingWithCopyWarning nor risk touching `full_df`.
df = full_df[
    [
        "id",
        "price",
        "number_of_reviews",
        "review_scores_rating",
        "bedrooms",
        "city",
        "neighbourhood",
    ]
].copy()
df.tail(10)
Out[6]:
In [7]:
# Compare the number of distinct ids with the number of rows.
listing_ids = df["id"]
print(listing_ids.nunique())
print(listing_ids.shape)
In [8]:
# Check the dtypes of the selected columns before the price column is cleaned.
df.dtypes
Out[8]:
We need to convert the prices to floats, which means first stripping the `$` and `,` characters.
In [9]:
# Strip the "$" and "," characters from the price column and convert it to
# float64.
#  - The raw-string character class r"[$,]" removes both characters in one
#    pass and avoids the invalid "\$" / "\," escape sequences.
#  - .assign() builds a new frame instead of mutating in place, so there is
#    no SettingWithCopyWarning on a frame that was sliced from `full_df`.
df = df.assign(
    price=df["price"].replace(r"[$,]", "", regex=True).astype("float64")
)
In [10]:
# Re-check the dtypes after the conversion; price should now be float64.
df.dtypes
Out[10]:
We might think that better apartments get rented more often. Let's plot the number of reviews against the review score, first as a scatter plot and then as a set of box plots.
In [11]:
# Scatter of review count against review score; low alpha shows point density.
df.plot(
    kind="scatter",
    x="number_of_reviews",
    y="review_scores_rating",
    figsize=(10, 8),
    alpha=0.2,
)
Out[11]:
In [12]:
# Box plot of review scores, one box per number-of-reviews bucket.
bins = [0, 5, 10, 25, 50, 100, 350]
boxplot_vecs = []
bin_labels = []
fig, ax = plt.subplots(figsize=(10, 8))

n_reviews = df["number_of_reviews"]
# Pair consecutive edges instead of hard-coding the number of buckets.
for lb, ub in zip(bins[:-1], bins[1:]):
    # Buckets are half-open [lb, ub) so a listing whose review count sits
    # exactly on an edge is counted once, not in two neighbouring boxes.
    # Only the final bucket is closed, so the top edge is still included.
    if ub == bins[-1]:
        in_bin = (n_reviews >= lb) & (n_reviews <= ub)
        bin_labels.append(f"[{lb}, {ub}]")
    else:
        in_bin = (n_reviews >= lb) & (n_reviews < ub)
        bin_labels.append(f"[{lb}, {ub})")
    # Missing scores are dropped before the bucket is handed to boxplot.
    scores_in_bin = df.loc[in_bin, "review_scores_rating"].dropna()
    boxplot_vecs.append(scores_in_bin.values)

ax.boxplot(boxplot_vecs)
# Label the ticks separately rather than via boxplot(labels=...), which is
# deprecated in newer matplotlib releases.
ax.set_xticklabels(bin_labels)
ax.set_title("Review score by number of reviews")
ax.set_xlabel("Number of Reviews")
ax.set_ylabel("Review Score")
plt.show()
Better reviews are also correlated with higher prices.
In [13]:
# Scatter of review score against price; low alpha shows point density.
df.plot(
    kind="scatter",
    x="review_scores_rating",
    y="price",
    figsize=(10, 8),
    alpha=0.2,
)
Out[13]:
In [14]:
# Scatter of bedroom count against price for all listings.
df.plot(kind="scatter", x="bedrooms", y="price", figsize=(10, 8))
Out[14]:
In [17]:
# Same bedrooms-vs-price scatter, restricted to rows whose city is "New York".
df.loc[df["city"].eq("New York")].plot(kind="scatter", x="bedrooms", y="price")
Out[17]:
In [16]:
# Same bedrooms-vs-price scatter, restricted to the "Upper West Side" neighbourhood.
df.loc[df["neighbourhood"].eq("Upper West Side")].plot(
    kind="scatter", x="bedrooms", y="price"
)
Out[16]:
In [ ]: