This is a crawl for a site that has "for sale by owner" listings. Unfortunately, the volume of listings is too low to be particularly interesting for me. I did this anyway to keep an eye on the data because a single transaction (the absolute best one) would still be worth the investment of time.
In [77]:
import requests
import BeautifulSoup as bsoup
import pandas as pd
import time
import datetime
In [72]:
def _node_text(node):
    """Return the text of a parsed node, or '' when the lookup found nothing."""
    if node is None:
        return ''
    return node.text


def _to_number(raw, cast, default):
    """Convert `raw` with `cast` (int or float); return `default` if it is not numeric."""
    try:
        return cast(raw.strip())
    except ValueError:
        return default


def extract_listings(txt):
    """Parse one search-results page into a list of listing dicts.

    txt: HTML source of the page.

    Returns a list with one dict per 'estate-info' div, with the keys
    'price', 'title', 'lastUpdated', 'address', 'beds', 'baths', 'sqft'
    and 'htype'. Text fields whose element is missing come back as ''.
    beds/baths/sqft stay at 0 when they are absent or not parseable, so a
    single incomplete listing no longer aborts the whole extraction.
    """
    plistings = []
    soup = bsoup.BeautifulSoup(txt)
    for listing in soup.findAll('div', {'class': 'estate-info'}):
        price = _node_text(listing.find(
            'div', {'class': 'estateSummary-price mix-estateSummary_SM-price_sm'}))
        title = _node_text(listing.find(
            'div', {'class': 'estateSummary-title mix-estateSummary_SM-title_sm'}))
        address = _node_text(listing.find('div', {'class': 'estateSummary-address'}))

        last_updated = _node_text(
            listing.find('em', {'class': 'highlight-text isHiddenSM'}))
        last_updated = last_updated.replace('Last updated ', '')

        beds = 0
        baths = 0
        sqft = 0
        htype = ''
        for summary in listing.findAll('div', {'class': 'estateSummary-list'}):
            for cell in summary.findAll('div'):
                # Use a separate name here: the original loop reused `txt`
                # and clobbered the function's parameter.
                cell_text = cell.text
                # `> 0` (not `>= 0`): the label must be preceded by at least
                # one character to be read as "<number> <label>".
                if cell_text.find('Beds') > 0:
                    beds = _to_number(
                        cell_text[:cell_text.find('Beds')], float, beds)
                elif cell_text.find('Baths') > 0:
                    baths = _to_number(
                        cell_text[:cell_text.find('Baths')], float, baths)
                elif cell_text.find('Sqft') > 0:
                    # Square footage may carry thousands separators ("1,250").
                    sqft = _to_number(
                        cell_text[:cell_text.find('Sqft')].replace(',', ''),
                        int, sqft)
                else:
                    # Any other cell is taken as the home type; the last
                    # such cell wins.
                    htype = cell_text

        plistings.append({
            'price': price,
            'title': title,
            'lastUpdated': last_updated,
            'address': address,
            'beds': beds,
            'baths': baths,
            'sqft': sqft,
            'htype': htype,
        })
    return plistings
In [73]:
# Walk the paginated Los Angeles house/condo search, page by page, until the
# site reports that there are no more results; collect every parsed listing
# into `plistings` and build the DataFrame `df` from them.
pnum = 1
test = -1  # index of the "no results" message in the page; -1 means not found
plistings = []
while test == -1:
    # The timeout keeps a stalled connection from hanging the crawl forever.
    r = requests.get('http://www.forsalebyowner.com/search/list/los-angeles-california/house,condo-types/' + str(pnum) + '-page/proximity,desc-sort', timeout=30)
    test = r.text.find('Your search did not yield any results.')
    if test == -1:
        page_listings = extract_listings(r.text)
        if not page_listings:
            # Neither listings nor the "no results" message (error page,
            # changed markup, ...): stop rather than request pages forever.
            break
        plistings.extend(page_listings)
        pnum += 1
        # Pause between requests to go easy on the server.
        time.sleep(.5)
df = pd.DataFrame(plistings)
In [74]:
# Show how many listings were collected across all crawled pages.
len(plistings)
Out[74]:
In [75]:
# Display the DataFrame built from the collected listings.
df
Out[75]:
In [76]:
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
# NOTE: 'price' holds the scraped text as-is, so this ordering is
# lexicographic rather than numeric.
df.sort_values('price')
Out[76]:
In [88]:
# Write the listings to a tab-separated file named after today's date
# (out_YYYY-MM-DD.tsv), i.e. one file per calendar day.
today = datetime.datetime.now()
dt = '-'.join([str(today.year), str(today.month).zfill(2), str(today.day).zfill(2)])
out_path = 'out_' + dt + '.tsv'
df.to_csv(out_path, sep='\t', index=False)