In [11]:
import sys
import pandas as pd
import matplotlib as ml
import matplotlib.pyplot as plt
%matplotlib inline
In [5]:
# Record the interpreter and library versions this notebook was run with.
version_report = (
    ("Python version: ", sys.version),
    ("Pandas version: ", pd.__version__),
    ("Matplotlib version: ", ml.__version__),
)
for label, version in version_report:
    print(label, version)
First, we read in the data.
In [7]:
# Download the gzipped listings CSV and load it into a DataFrame.
url1 = "http://data.insideairbnb.com/united-states/"
url2 = "ny/new-york-city/2016-02-02/data/listings.csv.gz"
listings_url = url1 + url2
full_df = pd.read_csv(listings_url, compression="gzip")
full_df.head()
Out[7]:
We don't need every column, so let's focus on a few variables.
In [136]:
# Keep only the columns used in the analysis below. Take an explicit copy so
# that the later clean-up of `price` operates on an independent frame instead
# of a slice of `full_df` (writing into such a slice raises pandas'
# SettingWithCopyWarning and may or may not affect the parent frame).
columns_of_interest = ["id", "price", "number_of_reviews", "review_scores_rating"]
df = full_df[columns_of_interest].copy()
df.head()
Out[136]:
The `price` column is stored as text containing `$` signs and thousands separators, so we need to convert it to floats.
In [137]:
# Convert `price` from text (expected to look like "$1,200.00") to float64.
# - A single raw-string character class strips both "$" and "," in one pass and
#   avoids the invalid "\$" / "\," escape sequences of a non-raw literal.
# - `Series.replace(..., regex=True)` only rewrites string values, so re-running
#   this cell on an already-numeric column is a harmless no-op.
# - `assign` builds a new frame instead of mutating in place, which avoids
#   writing into a slice of `full_df` (SettingWithCopyWarning).
price_text = df["price"].replace(r"[$,]", "", regex=True)
df = df.assign(price=price_text.astype("float64"))
We might think that better apartments get rented more often. Using the number of reviews as a proxy for how often a listing is rented, let's plot the number of reviews against the review score, first as a scatter plot and then as a set of box plots.
In [15]:
# Scatter of review count (x) against review score (y); the low alpha keeps
# dense regions readable.
df.plot(
    kind="scatter",
    x="number_of_reviews",
    y="review_scores_rating",
    figsize=(10, 8),
    alpha=0.2,
)
Out[15]:
In [56]:
# Box plots of review score, grouped by how many reviews a listing has.
#
# Buckets are half-open, [lb, ub), so a listing with exactly 5, 10, 25, 50 or
# 100 reviews lands in one box only; closed ranges on both sides would count
# those boundary listings in two adjacent boxes. The last bucket also includes
# its upper edge so a count equal to bins[-1] is not dropped.
bins = [0, 5, 10, 25, 50, 100, 350]
boxplot_vecs = []
bin_labels = []

review_counts = df["number_of_reviews"]
bin_edges = list(zip(bins[:-1], bins[1:]))
for idx, (lb, ub) in enumerate(bin_edges):
    is_last = idx == len(bin_edges) - 1
    # Vectorized mask; NaN counts compare False and are therefore excluded.
    below_upper = (review_counts <= ub) if is_last else (review_counts < ub)
    in_bin = (review_counts >= lb) & below_upper
    # boxplot cannot handle NaN scores, so drop listings without a rating.
    boxplot_vecs.append(df.loc[in_bin, "review_scores_rating"].dropna().values)
    bin_labels.append("[{}, {}{}".format(lb, ub, "]" if is_last else ")"))

fig, ax = plt.subplots(figsize=(10, 8))
ax.boxplot(boxplot_vecs)
# Set tick labels separately: the `labels=` keyword of boxplot was renamed to
# `tick_labels` in matplotlib 3.9, whereas set_xticklabels works on all versions.
ax.set_xticklabels(bin_labels)
ax.set_xlabel("number_of_reviews")
ax.set_ylabel("review_scores_rating")
plt.show()
Better reviews are also correlated with higher prices.
In [141]:
# Scatter of review score (x) against nightly price (y).
price_scatter_kwargs = dict(
    x="review_scores_rating",
    y="price",
    figsize=(10, 8),
    alpha=0.2,
)
df.plot.scatter(**price_scatter_kwargs)
Out[141]:
In [ ]: