------------------------
(C) 08/2017 by Andy Mai
Email: andymai.ca@aol.com
San Jose, CA - Capital of Silicon Valley
------------------------
In this example, we will collect apartment rental prices for the SF Bay Area (California) from Craigslist.
In [1]:
import pandas as pd
%pylab inline
In [2]:
import requests
from bs4 import BeautifulSoup as bs4
In [3]:
# Craigslist search endpoint for SF Bay Area apartment listings.
url_base = 'https://sfbay.craigslist.org/search/apa'
# Filters that requests encodes into the query string.
params = dict(bedrooms=1, is_furnished=1)
# requests never times out by default; an explicit timeout keeps this cell
# from hanging forever if the server stops responding.
resq = requests.get(url_base, params=params, timeout=30)
# We have the full URL:
print(resq.url)
We will use the BeautifulSoup API to parse the HTML we get back from Craigslist and make its content easier to work with.
In [4]:
# Parse the response body with Python's built-in HTML parser.
html = bs4(resq.text, features='html.parser')
# Preview only the first 900 characters of the pretty-printed document.
print(html.prettify()[0:900])
In [5]:
# Every <p class="result-info"> element is collected as one listing.
apts = html.find_all('p', class_='result-info')
print(len(apts))
`apts` is a list of 120 rental listings. We take one of them as a sample and work on it first, to get a sense of how to apply the same steps to the whole list.
In [6]:
# Take the listing at index 15 as the sample used to prototype the field
# extraction in the following cells.
this_aprt = apts[15]
print(this_aprt)
In [7]:
# Raw text of the first element carrying the "housing" class.
size = this_aprt.find_all(class_='housing')[0].text
print(size)
In [8]:
def size_brs(size):
    """Parse a listing's housing text into ``(apt_size, n_brs)``.

    The text is split on ``-`` and every piece is inspected on its own, so the
    bedroom count and the floor area are each picked up whether they appear
    together, alone, or alongside extra dash-separated pieces.

    Parameters
    ----------
    size : str
        Housing text such as ``'\\n 1br -\\n 700ft2 -\\n'``.

    Returns
    -------
    tuple of (float, float)
        ``(apt_size, n_brs)``: the number in front of ``ft2`` and the number
        in front of ``br``. A value that is absent from the text is NaN.

    Raises
    ------
    ValueError
        If a piece containing ``br`` or ``ft2`` does not hold a parsable number.
    """
    apt_size = float('nan')
    n_brs = float('nan')
    for token in size.split('-'):
        token = token.strip()
        if 'br' in token:
            n_brs = float(token.replace('br', ''))
        elif 'ft2' in token:
            apt_size = float(token.replace('ft2', ''))
    return apt_size, n_brs
In [9]:
# size_brs returns the size first and the bedroom count second.
apt_size, n_brs = size_brs(size)
In [10]:
# Read the `datetime` attribute of the listing's <time> tag and convert it
# with pandas.
time = pd.to_datetime(this_aprt.find('time')['datetime'])
In [11]:
# Text of the price span with the '$' characters stripped, as a float.
price = float(this_aprt.find('span', class_='result-price').text.strip('$'))
In [12]:
# The listing headline is the text of the title anchor.
title = this_aprt.find('a', class_='result-title hdrlnk').get_text()
In [13]:
# The listing URL is the href of that same title anchor.
link = this_aprt.find('a', class_='result-title hdrlnk')['href']
In [14]:
# Show the extracted fields of the sample listing, one per line.
print('\n'.join(map(str, (apt_size, n_brs, time, price, title))))
In [15]:
# Offsets 0, 120, ..., 960: one value per results page requested later on.
page_index = np.arange(start=0, stop=1000, step=120)
In [16]:
def set_price(x):
    """Return the listing's price as a float, or NaN when it has none.

    Parameters
    ----------
    x : object with a ``find`` method
        The listing element; its ``span`` with class ``result-price`` is read.

    Returns
    -------
    float
        The price with the ``$`` sign, surrounding whitespace and thousands
        separators removed, or ``np.nan`` if no price span is present.

    Raises
    ------
    ValueError
        If the remaining text is not a parsable number.
    """
    price_tag = x.find('span', {'class': 'result-price'})
    if price_tag is None:  # Avoid getting None value
        return np.nan
    # "$1,200" would make float() fail, so drop the separator along with '$'.
    amount = price_tag.text.strip().strip('$').replace(',', '')
    return float(amount)
def set_time(x):
    """Return the listing's posting time, or NaN when it is unavailable.

    Parameters
    ----------
    x : object with a ``find`` method
        The listing element; the ``datetime`` attribute of its ``time`` tag
        is read.

    Returns
    -------
    pandas.Timestamp or float
        The parsed timestamp, or ``np.nan`` if the tag or its ``datetime``
        attribute is missing.
    """
    time_tag = x.find('time')
    # Check the tag itself before reading its attribute: subscripting a
    # missing tag (None) would raise before any None check could run.
    stamp = time_tag.get('datetime') if time_tag is not None else None
    if stamp is None:  # Avoid getting None value
        return np.nan
    return pd.to_datetime(stamp)
In [17]:
import time as tp

# The endpoint does not change between pages, so it is built once.
url_base = 'https://sfbay.craigslist.org/search/apa'

result = []
for j in page_index:
    # Each value of page_index is sent as the `s` query parameter
    # (presumably the result offset of the page -- confirm against the site).
    params = dict(bedrooms=1, is_furnished=1, s=j)
    # An explicit timeout keeps a stalled connection from blocking the loop.
    resq = requests.get(url_base, params=params, timeout=30)
    html = bs4(resq.text, 'html.parser')
    apts = html.find_all('p', attrs={'class': 'result-info'})
    for this_aprt in apts:
        # Some listings carry no housing element; record NaN for them rather
        # than failing on an empty find_all() result.
        housing = this_aprt.find(attrs={'class': 'housing'})
        if housing is None:
            apt_size, n_brs = np.nan, np.nan
        else:
            size = housing.text
            apt_size, n_brs = size_brs(size)
        time = set_time(this_aprt)
        price = set_price(this_aprt)
        # Title and link come from the same anchor, so look it up once.
        anchor = this_aprt.find('a', {'class': 'result-title hdrlnk'})
        title = anchor.text
        link = anchor['href']
        # A plain tuple keeps each value's own type; np.array would coerce a
        # row of floats and strings into strings.
        data = (time, price, apt_size, n_brs, title, link)
        result.append(data)
    # Pause between page requests.
    tp.sleep(1)
In [18]:
col_names = ['time', 'price', 'size', 'brs', 'title', 'link']
result = pd.DataFrame(data=result, columns=col_names)
# The scraped rows mix timestamps, numbers and strings, so make sure the
# numeric columns are real floats (anything unparsable becomes NaN).
numeric_cols = ['price', 'size', 'brs']
result[numeric_cols] = result[numeric_cols].apply(pd.to_numeric, errors='coerce')
# set_index returns a new frame, so chain it into the display. `result`
# itself keeps `time` as a regular column and the cell stays re-runnable.
result.set_index('time').head()
Out[18]:
In [19]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [21]:
# Distribution of asking prices: $100-wide bins from $0 up to $10,000,
# drawn on a 12x10 figure with the x-axis limited to (0, 12000).
plt.figure(figsize=(12, 10))
price_bins = np.arange(0, 10000, 100)
price_axes = sns.distplot(
    result.price.dropna(),
    bins=price_bins,
    color='red',
    axlabel='SF Bay Area - Apartments\' Rental Price ($)',
)
price_axes.set(xlim=(0, 12000))
Out[21]: