In [1]:
# Python 3.4
%pylab inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup as bs4
Browse the Craigslist website, select the relevant search criteria, and then look at the resulting web address; the chosen filters are encoded in the URL's query string:
Houses for sale in the East Bay:
http://sfbay.craigslist.org/search/eby/rea?housing_type=6
Houses for sale in selected neighborhoods in the East Bay:
http://sfbay.craigslist.org/search/eby/rea?nh=46&nh=47&nh=48&nh=49&nh=112&nh=54&nh=55&nh=60&nh=62&nh=63&nh=66&housing_type=6
In [2]:
# Get the data: houses posted for sale on Craigslist in the East Bay.
url_base = 'http://sfbay.craigslist.org/search/eby/rea?housing_type=6'
# A timeout keeps the notebook from hanging forever on a stalled connection.
data = requests.get(url_base, timeout=30)
# Fail loudly on 4xx/5xx responses instead of silently parsing an error page
# (which would just produce an empty list of listings further down).
data.raise_for_status()
# Show the final URL (after any redirects) that the results came from.
print(data.url)
In [3]:
# Parse the downloaded page text; the parser is named explicitly so that
# BeautifulSoup treats the text as HTML.
html = bs4(markup=data.text, features='html.parser')
In [9]:
# Display the html in a somewhat readable way, to note the structure of housing listings
# then comment it out because it prints out a large amount to the screen
# print(html.prettify())
In [4]:
# From inspecting the prettified page: every housing entry lives in a <p class="row">.
# Collect all of those entries and report how many were found.
houses = html.find_all('p', class_='row')
print(len(houses))
In [7]:
# List the raw neighborhood text (the 'pnr' field) of every house in the list.
# Built in one pass from a list so that:
#   * the last listing is included (the old range(len(houses)-1) loop skipped it),
#   * the column holds strings from the start rather than float placeholders,
#   * the column name is given as a proper mapping key, not a set,
#   * no name from the %pylab star-import (ones) is required.
# A listing without a 'pnr' element gets None instead of raising IndexError.
pnr_tags = [house.find(attrs={'class': 'pnr'}) for house in houses]
neighborhoods = pd.DataFrame(
    {'Neighborhoods': [tag.text if tag is not None else None for tag in pnr_tags]}
)
#print(neighborhoods)
In [11]:
# Each housing listing has a consistent structure that the code below relies on:
# a <time> element,
# a <span class="price">,
# a <span class="housing"> (bedrooms / square footage),
# a <span class="pnr"> neighborhood field
# Pull out a single listing to inspect; downstream exploration reads `one_house`.
one_house = houses[11] # 11, 19, 28 are row numbers used for inspection; raises IndexError if fewer listings were found
# Print out and view a single house entry, use prettify to make more legible
print(one_house.prettify())
A single housing entry looks like this:
OPEN House Sunday 2-4pm, For SALE Spacious 2 Bedroom, 1 Bathroom Home $440000 / 2br - 1156ft 2 - (Oakland) pic map
OPEN HOUSE SAT 1-4 Sunny Richmond Home 3br - 1330ft 2 - (richmond / point / annex) pic map
Millsmont House For Sale $579950 / 3br - 1912ft 2 - (oakland hills / mills) pic map
Excellent Home in Berkeley $450000 (berkeley) pic
Conveniently located in Albany $600000 (albany / el cerrito) pic
In [14]:
# Exploration cell: for the single listing in `one_house`, extract each field of
# interest (listing text, price, neighborhood, size, title, date posted), first raw
# and then cleaned, printing both so the cleaning steps can later be automated.
# Every lookup indexes [0] or dereferences the result directly, so this cell raises
# (IndexError / AttributeError / ValueError) if the chosen listing lacks a field.
# Listing
allspecs = one_house.findAll(attrs={'class': 'l2'})[0].text # `findAll` returns a list; only the first match is used
print('Listing: \n', allspecs, '\n')
# Price
print('Price:')
# Raw price text as it appears in the page
price = one_house.findAll(attrs={'class': 'price'})[0].text
print(price)
# Cleaned: drop '$' characters from both ends and convert to float
price = float(one_house.find('span', {'class': 'price'}).text.strip('$'))
print(price, '\t', type(price), '\n')
# Neighborhood
print('Neighborhood:')
# Raw neighborhood text
neighborhood = one_house.findAll(attrs={'class': 'pnr'})[0].text
print(neighborhood)
# Keep the neighborhood: strip spaces and opening parentheses from both ends,
# then split at the closing parenthesis and keep only the part before it.
# example: ' (vallejo / benicia) pic map '
neighborhood = one_house.findAll(attrs={'class': 'pnr'})[0].text.strip(' (').split(')')[0]
print(neighborhood, '\t', type(neighborhood), '\n')
#print(len([rw.findAll(attrs={'class': 'pnr'})[0].text.strip(' (').split(')')[0] for rw in houses]))
# Size
print('Size: bedrooms and sq ft: ')
# Raw size text
size = one_house.findAll(attrs={'class': 'housing'})[0].text
print(size)
# Strip the text of leading and trailing characters: /, dashes, and spaces.
# Split on ' - ' so bedrooms and square footage become 2 entries in a list.
size = one_house.findAll(attrs={'class': 'housing'})[0].text.strip('/- ').split(' - ')
print(size)
# Delete suffixes and keep just the numbers.
# Assumes both entries are present, in the order bedrooms then square footage;
# a listing with only one of them raises IndexError or ValueError here.
size[0] = float(size[0].replace('br', '')) # number of bedrooms
size[1] = float(size[1].replace('ft2', '')) # square footage
print(size, '\t', type(size[0]), '\n')
# Address/Posting Title: text of the element with class 'hdrlnk'
address = one_house.findAll(attrs={'class': 'hdrlnk'})[0].text
print(address, '\n')
#link = 'http://sfbay.craigslist.org/search' + one_house.findAll(attrs={'class': 'hdrlnk'})[0]['href']
#print(link, '\n')
# Date posted: the 'datetime' attribute of the <time> element inside the 'pl' element
dateposted = one_house.findAll(attrs={'class': 'pl'})[0].time['datetime']
print(dateposted, '\t', type(dateposted))
# Convert to a pandas datetime so the calendar date can be extracted
date = pd.to_datetime(one_house.find('time')['datetime']).date()
print(date, '\t', type(date))
In [15]:
# Define 4 functions for the price, neighborhood, sq footage & # bedrooms, and time
# that can deal with missing values (to prevent errors from showing up when running the code)
# Prices
def find_prices(results):
    """Extract the asking price of every listing.

    Parameters
    ----------
    results : iterable
        Listing elements; each must provide ``find(name, attrs)`` returning
        either ``None`` or an object with a ``.text`` attribute.

    Returns
    -------
    list of float
        One price per listing, in input order. ``np.nan`` marks a listing
        whose price element is missing or whose text is not a number.
    """
    prices = []
    for rw in results:
        price_tag = rw.find('span', {'class': 'price'})
        price = np.nan
        if price_tag is not None:
            # Tolerate surrounding whitespace and thousands separators
            # ("$1,250,000") in addition to the plain "$440000" form.
            cleaned = price_tag.text.strip().lstrip('$').replace(',', '')
            try:
                price = float(cleaned)
            except ValueError:
                # Non-numeric price text: record as missing rather than
                # aborting the whole scrape on one odd listing.
                price = np.nan
        prices.append(price)
    return prices
# Neighborhoods
# Example: ' (oakland hills / mills) pic map '
# Define a function for neighborhood in case a field is missing in 'class': 'pnr'
def find_neighborhood(results):
    """Extract the neighborhood name of every listing.

    The 'pnr' field looks like ``' (oakland hills / mills) pic map '``: an
    optional parenthesised neighborhood followed by 'pic'/'map' markers.

    Parameters
    ----------
    results : iterable
        Listing elements; each must provide ``find(name, attrs)`` returning
        either ``None`` or an object with a ``.text`` attribute.

    Returns
    -------
    list
        One entry per listing, in input order: the text between the leading
        '(' and the first ')' (surrounding whitespace removed), or ``np.nan``
        when the field is missing or carries no parenthesised neighborhood.
    """
    neighborhoods = []
    for rw in results:
        # Reset for every listing so a value can never leak over from the
        # previous iteration.
        neighborhood = np.nan
        pnr_tag = rw.find('span', {'class': 'pnr'})
        if pnr_tag is not None:
            text = pnr_tag.text.strip()
            # Only a field that opens with '(' carries a neighborhood; a bare
            # 'pic map' does not. (The previous test
            # `'pic map' or 'pic' or 'map' in split[0]` was always true.)
            if text.startswith('('):
                name, closing, _ = text.lstrip('( ').partition(')')
                name = name.strip()
                if closing and name:
                    neighborhood = name
        neighborhoods.append(neighborhood)
    return neighborhoods
# Size
# Make a function to deal with size in case #br or ft2 is missing
def _number_or_nan(text):
    """Convert text to float, returning np.nan when it is not a number."""
    try:
        return float(text)
    except ValueError:
        return np.nan


def find_size_and_brs(results):
    """Extract square footage and bedroom count of every listing.

    The 'housing' field looks like ``'/ 2br - 1156ft2 - '``; either part (or
    the whole field) may be absent.

    Parameters
    ----------
    results : iterable
        Listing elements; each must provide ``find(name, attrs=...)`` returning
        either ``None`` or an object with a ``.text`` attribute.

    Returns
    -------
    (sqft, bedrooms) : tuple of two lists of float
        Parallel lists, one entry per listing in input order. ``np.nan``
        marks a value that is missing or not numeric.
    """
    sqft = []
    bedrooms = []
    for rw in results:
        # Start every listing from "missing" so that no branch can reuse the
        # previous listing's values (or hit an unbound name on the first row).
        size = np.nan
        n_brs = np.nan
        housing = rw.find('span', attrs={'class': 'housing'})
        if housing is not None:
            # Remove leading/trailing slashes, dashes and spaces, then split
            # the bedroom part from the square-footage part.
            for part in housing.text.strip('/- ').split(' - '):
                if 'br' in part:
                    n_brs = _number_or_nan(part.replace('br', ''))
                elif 'ft2' in part:
                    size = _number_or_nan(part.replace('ft2', ''))
        sqft.append(size)
        bedrooms.append(n_brs)
    return sqft, bedrooms
# Time posted
def find_times(results):
    """Extract the posting timestamp of every listing.

    Parameters
    ----------
    results : iterable
        Listing elements; each must provide ``find_all(attrs=...)`` returning
        a list of elements whose ``.time`` attribute is either ``None`` or an
        object supporting ``.get('datetime')``.

    Returns
    -------
    pandas.DatetimeIndex
        One timestamp per listing, in input order. ``NaT`` marks a listing
        with no 'pl' element, no <time> child, or no 'datetime' attribute.
    """
    times = []
    for rw in results:
        # Check each level before dereferencing it; previously the lookup
        # chain raised before its `is not None` test could ever run.
        pl_tags = rw.find_all(attrs={'class': 'pl'})
        time_tag = pl_tags[0].time if pl_tags else None
        stamp = time_tag.get('datetime') if time_tag is not None else None
        times.append(stamp if stamp is not None else pd.NaT)
    return pd.to_datetime(times)
In [16]:
# Run every extractor over the full list of listings.
prices = find_prices(houses)
neighborhoods = find_neighborhood(houses)
sqft, bedrooms = find_size_and_brs(houses)
times = find_times(houses)
# Check: print the length of each extracted column, one per line
# (they should all equal the number of listings).
for column in (prices, neighborhoods, sqft, bedrooms, times):
    print(len(column))
In [18]:
# Collect the extracted fields in a dataframe so they can be worked with.
# The three numeric lists become the columns of one 2-D float array ...
housesdata = np.column_stack([prices, sqft, bedrooms])
#print(housesdata)
# ... which seeds the dataframe; the dates and neighborhoods are then attached
# as additional columns.
housesdf = pd.DataFrame(housesdata, columns=['Price', 'SqFeet', 'nBedrooms'])
housesdf['DatePosted'] = times
housesdf['Neighborhood'] = neighborhoods
print(housesdf.tail(5))
In [19]:
# Show the dtype pandas assigned to each column of the dataframe.
print(housesdf.dtypes)
In [22]:
# Quick plot to look at the data: asking price against square footage.
fig, ax = plt.subplots(figsize=(10.0, 6.0))  # (width, height) in inches
ax.plot(housesdf.SqFeet, housesdf.Price, 'bo')
ax.set_xlim(0, 5000)
ax.set_ylim(0, 3000000)
# Raw strings: the LaTeX backslashes (\m, \;, \$) are not valid Python escape
# sequences, so non-raw literals trigger an invalid-escape warning.
ax.set_xlabel(r'$\mathrm{Square \; feet}$', fontsize=18)
ax.set_ylabel(r'$\mathrm{Price \; (in \; \$)}$', fontsize=18)
# Cell output: number of rows plotted (including rows with missing values).
len(housesdf.SqFeet)
Out[22]:
In [23]:
# Quick plot to look at the data: asking price against number of bedrooms.
fig, ax = plt.subplots(figsize=(10.0, 6.0))  # (width, height) in inches
ax.plot(housesdf.nBedrooms, housesdf.Price, 'bo')
ax.set_xlim(1.5, 5.5)
ax.set_ylim(0, 3000000)
# Raw strings: the LaTeX backslashes (\m, \;, \$) are not valid Python escape
# sequences, so non-raw literals trigger an invalid-escape warning.
ax.set_xlabel(r'$\mathrm{Number \; of \; Bedrooms}$', fontsize=18)
ax.set_ylabel(r'$\mathrm{Price \; (in \; \$)}$', fontsize=18)
# Cell output: number of rows plotted (including rows with missing values).
len(housesdf.nBedrooms)
Out[23]:
In [29]:
# Show the listings for one neighborhood. Other neighborhood names to try:
#   'berkeley'
#   'oakland rockridge / claremont'
#   'albany / el cerrito'
#   'richmond / point / annex'
housesdf.loc[housesdf['Neighborhood'] == 'berkeley north / hills']
Out[29]:
In [46]:
# Select the listings priced under $700k once and reuse the selection below.
under_700k = housesdf[housesdf.Price < 700000]
# How many are there? (count() reports non-null entries per column)
print(under_700k.count(), '\n')
# Which neighborhoods are they in?
print(set(under_700k.Neighborhood))
# Show them ordered from least to most expensive.
under_700k.sort_values('Price')
Out[46]:
In [47]:
# Group the listings by neighborhood; the GroupBy object is kept in `by_neighborhood`.
by_neighborhood = housesdf.groupby('Neighborhood')
# Per neighborhood, the number of non-null records in each column.
print(by_neighborhood.count())#.head()) # NOT NULL records within each column
#print('\n')
#print(by_neighborhood.size())#.head()) # total records for each neighborhood
#by_neighborhood.Neighborhood.nunique()
In [48]:
# Total number of rows in the dataframe
print(housesdf.shape[0])
# Number of distinct neighborhood values
print(len(set(housesdf['Neighborhood'])))
# Cell output: the distinct neighborhood values themselves
set(housesdf['Neighborhood'])
Out[48]:
In [50]:
# Group the results by neighborhood and take the average home price in each one.
# The Price column is selected *before* aggregating: this avoids averaging every
# other column only to discard it, and avoids errors from columns that cannot be
# averaged.
by_neighborhood = housesdf.groupby('Neighborhood')['Price'].mean() # by_neighborhood_mean_price
print(by_neighborhood.head(5), '\n')
# Average for one neighborhood; raises KeyError if it has no listings in this scrape.
print(by_neighborhood['berkeley north / hills'], '\n')
#print(by_neighborhood.index, '\n')
# Neighborhood averages ordered from least to most expensive (used by the bar chart).
by_neighborhood_sort_price = by_neighborhood.sort_values(ascending=True)
#print(by_neighborhood_sort_price.index) # a list of the neighborhoods sorted by price
print(by_neighborhood_sort_price)
In [56]:
# Plot the average home price for each neighborhood in the East Bay.
fntsz = 20
titlefntsz = 25
lablsz = 20
mrkrsz = 8
# plt.rc is used (pyplot is imported explicitly) rather than matplotlib.rc, which
# was only in scope through the %pylab star-import.
plt.rc('xtick', labelsize=lablsz)
plt.rc('ytick', labelsize=lablsz)

# Choose a baseline, based on proximity to current location:
# 'berkeley', 'berkeley north / hills', 'albany / el cerrito'
neighborhood_name = 'berkeley north / hills'

# Aggregate once and reuse, instead of re-running the groupby for every number.
price_stats = housesdf.groupby('Neighborhood')['Price'].agg(['mean', 'max', 'min'])
# Neighborhoods with no priced listing have no bar; order bars by average price.
mean_prices = price_stats['mean'].dropna().sort_values(ascending=True)
# .loc replaces .ix, which was removed from pandas; raises KeyError if the
# baseline neighborhood has no listings in this scrape.
baseline = price_stats.loc[neighborhood_name]

fig, ax = plt.subplots(figsize=(13.0, 8.0))  # (width, height) in inches

# Bar chart, one bar per neighborhood
bar_positions = list(range(len(mean_prices)))
ax.bar(bar_positions, mean_prices.values, align='center')
# Horizontal line at the baseline neighborhood's average home price
ax.axhline(y=baseline['mean'], linestyle='--')
# Horizontal grid lines; the on/off flag is passed positionally because its
# keyword name differs between matplotlib versions.
ax.grid(True, which='major', axis='y')

# Format x axis: one tick and label under every bar, and limits that show every
# bar in full (previously the first bar had no label and was half clipped).
ax.set_xticks(bar_positions)
ax.set_xticklabels(mean_prices.index, rotation='vertical', fontsize=fntsz)
ax.set_xlim(-0.5, len(mean_prices) - 0.5)

# Format y axis
minor_yticks = np.arange(0, 2000000, 100000)
ax.set_yticks(minor_yticks, minor=True)
ax.tick_params(axis='y', labelsize=fntsz)
# Raw strings: the LaTeX backslashes are not valid Python escape sequences.
ax.set_ylabel(r'$\mathrm{Price \; (Dollars)}$', fontsize=titlefntsz)

# Set figure title
ax.set_title(r'$\mathrm{Average \; Home \; Prices \; in \; the \; East \; Bay \; (Source: Craigslist)}$', fontsize=titlefntsz)

# Save figure
#plt.savefig("home_prices.pdf", bbox_inches='tight')

# Home prices in Berkeley (or the baseline)
print('The average home price in %s is: $' %neighborhood_name, '{0:8,.0f}'.format(baseline['mean']), '\n')
print('The most expensive home price in %s is: $' %neighborhood_name, '{0:8,.0f}'.format(baseline['max']), '\n')
print('The least expensive home price in %s is: $' %neighborhood_name, '{0:9,.0f}'.format(baseline['min']), '\n')
In [ ]:
Copyright Jennifer L. Jones, 2015, 2016