In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import BeautifulSoup as soup
import json
import time
In [1]:
# Zillow "recently sold" search for Los Angeles, assembled from its path
# segments: region id, price filter, monthly-payment filter, map rectangle
# and zoom level.
url = (
    'http://www.zillow.com/homes/recently_sold/Los-Angeles-CA/12447_rid/'
    '0-625000_price/0-2325_mp/'
    '34.087782,-118.175297,33.94749,-118.563595_rect/'
    '11_zm/'
)
# Example of a paginated variant of the same search (page 2):
#http://www.zillow.com/homes/recently_sold/Los-Angeles-CA/12447_rid/0-625000_price/0-2325_mp/34.078683,-118.219242,33.956318,-118.51965_rect/11_zm/2_p/
In [81]:
# In-memory store of HTTP responses keyed by URL, so re-running the scraping
# cell does not re-download pages that were already fetched.
cache = dict()
In [197]:
# String form of the BeautifulSoup 3 element class. Child nodes are compared
# against it to tell real tags apart from text nodes; this check is specific
# to the `BeautifulSoup` (v3) package imported by this notebook.
_BS3_TAG_TYPE = "<class 'BeautifulSoup.Tag'>"

# Keys copied from the JSON payload embedded in the "minibubble" <div>.
_MINIBUBBLE_FIELDS = ('bath', 'title', 'image', 'bed', 'label', 'datasize', 'sqft')


def _is_tag(node):
    """Return True when `node` is a BeautifulSoup 3 Tag rather than a text node."""
    return str(type(node)) == _BS3_TAG_TYPE


def _read_figure(figure, listing):
    """Copy href, address (image alt text) and photo count from a <figure> into `listing`."""
    if not figure.contents:
        return
    link = figure.contents[0]
    if not _is_tag(link):
        return
    listing['href'] = link.get('href')
    # The first child of the link carries the address in its `alt` attribute.
    if link.contents and _is_tag(link.contents[0]):
        listing['address'] = link.contents[0].get('alt')
    # An optional second child wraps the photo-count text.
    if len(link.contents) > 1:
        counter = link.contents[1]
        if _is_tag(counter) and counter.contents:
            listing['numphotos'] = counter.contents[0]


def _read_minibubble(bubble, listing):
    """Decode the JSON text inside the minibubble <div> and copy its fields into `listing`."""
    if not bubble.contents or _is_tag(bubble.contents[0]):
        return
    jdata = bubble.contents[0]
    # The payload is backslash-escaped; strip the escapes before decoding.
    jdata = jdata.replace('\\\\/', '/').replace('\\', '')
    data = json.loads(jdata)
    for field in _MINIBUBBLE_FIELDS:
        # A key missing from the payload keeps the value already in `listing`.
        listing[field] = data.get(field, listing[field])


def xml2listing(articles):
    """Convert one parsed <article> element into a flat listing dict.

    Parameters
    ----------
    articles : BeautifulSoup Tag
        A single <article> element (despite the plural name). It must
        support ``.get(attr)`` and iteration over its child nodes.

    Returns
    -------
    dict
        Keys: id, statustype, latitude, longitude (read from the element's
        attributes; None when absent) and href, address, numphotos, bath,
        title, image, bed, label, datasize, sqft (empty string when the
        corresponding markup is not found).

    Raises
    ------
    ValueError
        If the minibubble payload is not valid JSON after unescaping.
    """
    # The attribute values are stored directly in the result. Previously
    # `id` and `statustype` were read and then reset to '' before use, so
    # they were always blank in the output.
    listing = {
        'id': articles.get('id'),
        'statustype': articles.get('statustype'),
        'latitude': articles.get('latitude'),
        'longitude': articles.get('longitude'),
        'href': '',
        'address': '',
        'numphotos': '',
        'bath': '',
        'title': '',
        'image': '',
        'bed': '',
        'label': '',
        'datasize': '',
        'sqft': '',
    }
    for elem in articles:
        if not _is_tag(elem):
            continue  # whitespace / text between tags
        if elem.name == 'figure':
            _read_figure(elem, listing)
        elif elem.name == 'div' and elem.get('class') == 'minibubble template hide':
            _read_minibubble(elem, listing)
        # Other <div> classes ('property-listing-data',
        # 'terse-list-card-actions', ...) are not parsed.
    return listing
In [ ]:
# TODO: capture the sold-on date and the sold price for each listing.
In [204]:
# Crawl the paginated search results starting at `url`, converting every
# <article> on each page into a listing dict.
#
# Relies on names defined in earlier cells: `url`, `cache`, `xml2listing`,
# and the `requests`, `time` and `soup` imports.
MAX_PAGE = 25000  # exclusive upper bound on the page counter

p = 1
listings = []
# `url` ends with '/', so strip it before appending the page suffix;
# otherwise pages 2+ are requested as '.../11_zm//2_p'.
base_url = url.rstrip('/')
while p < MAX_PAGE:
    print(p)
    if p == 1:
        url2 = url
    else:
        url2 = base_url + '/' + str(p) + '_p/'
    try:
        r = cache[url2]
    except KeyError:
        r = requests.get(url2)
        # Only successful responses are cached, so a transient error page
        # is retried on the next run instead of being replayed forever.
        if r.ok:
            cache[url2] = r
        time.sleep(1)  # throttle: at most one network request per second
    s = soup.BeautifulSoup(r.content)
    # findAll returns every <article> on the page; find() would return only
    # the first, dropping all other listings.
    articles = s.findAll('article')
    if not articles:
        # No listings on this page: treat it as the end of the results
        # rather than requesting every remaining page number.
        break
    for article in articles:
        listing = xml2listing(article)
        print('.')
        listings.append(listing)
    p += 1
In [210]:
# Scratch cell: re-parse whatever response `r` currently holds and pick out
# the first <article> element (find() returns a single match or None).
s = soup.BeautifulSoup(r.content)
articles = s.find('article')
In [211]:
# Spot-check the parser on the single <article> currently bound to `articles`.
xml2listing(articles)
In [201]:
# Tabulate the scraped listing dicts: one row per listing, one column per key.
df = pd.DataFrame(listings)
In [203]:
# Number of listing dicts collected in `listings`.
len(listings)
Out[203]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [112]:
%matplotlib inline
import matplotlib.pyplot as plt
import requests
import xml.etree.ElementTree
import pandas as pd
In [5]:
In [107]:
# ZIP codes (as strings) to request Trulia statistics for.
zipcodes = ['90024']
In [108]:
# Download Trulia ZIP-code statistics for every entry in `zipcodes` and
# collect one [zipcode, date, percentStateTraffic, percentNationalTraffic]
# row per <trafficStat> element.
#
# NOTE(review): `key` (the Trulia API key) is not defined anywhere in the
# visible notebook and must be set before this cell runs; do not hard-code
# it in the notebook.
# After the loop, `e` holds the parsed XML root of the last ZIP code fetched.
rows = []
for zipcode in zipcodes:
    # A dedicated name is used so the search `url` defined earlier in the
    # notebook is not overwritten.
    trulia_url = 'http://api.trulia.com/webservices.php?library=TruliaStats&function=getZipCodeStats&zipCode=' + zipcode + '&state=CA&startDate=2015-01-01&endDate=2016-01-11&apikey=' + key
    r = requests.get(trulia_url)
    r.raise_for_status()  # fail here rather than while parsing an error page
    fname = 'data.xml'
    # Write the raw bytes so the file matches the encoding declared in the
    # XML itself; `with` closes the file even if the write fails.
    with open(fname, 'wb') as f:
        f.write(r.content)
    e = xml.etree.ElementTree.parse(fname).getroot()
    stats = e.find('response').find('TruliaStats').find('trafficStats').findall('trafficStat')
    for stat in stats:
        row = [zipcode, stat.find('date').text, stat.find('percentStateTraffic').text, stat.find('percentNationalTraffic').text]
        rows.append(row)
In [115]:
# Build the traffic table and plot the state-traffic share over time.
df = pd.DataFrame(rows, columns=['zipcode', 'date', 'pStateTraffic', 'pNationalTraffic'])
# The values come straight from XML text, i.e. they are strings. Convert them
# to numbers before plotting: recent matplotlib versions draw string data as
# categories, which scrambles the y-axis. Unparseable values become NaN.
for traffic_col in ('pStateTraffic', 'pNationalTraffic'):
    df[traffic_col] = pd.to_numeric(df[traffic_col], errors='coerce')
plt.plot(df['pStateTraffic'])
Out[115]:
In [17]:
In [58]:
In [ ]:
In [137]:
# Flatten the weekly <listingStat> elements of the parsed response `e` into
# one dict per week. For every subcategory whose type reads
# "<n> Bedroom Properties" the count, median and average listing price are
# stored under num_<n>, median_<n> and mean_<n>.
rows = []
trulia_stats = e.find('response').find('TruliaStats')
listingStats = trulia_stats.find('listingStats').findall('listingStat')
for listingStat in listingStats:
    row = {'week': listingStat.find('weekEndingDate').text}
    for subcat in listingStat.find('listingPrice').findall('subcategory'):
        t = subcat.find('type').text
        # Only types with the phrase at offset 2 (a one-character bedroom
        # count followed by a space) are kept; everything else is skipped.
        if t.find('Bedroom Properties') != 2:
            continue
        br = t[0:2].strip()
        row.update({
            'num_' + br: subcat.find('numberOfProperties').text,
            'median_' + br: subcat.find('medianListingPrice').text,
            'mean_' + br: subcat.find('averageListingPrice').text,
        })
    rows.append(row)
In [140]:
# One row per week with num_/median_/mean_ columns per bedroom count.
# The cell values are still the raw XML text (strings), not numbers.
df = pd.DataFrame(rows)
In [143]:
# Median listing price of 2-bedroom properties, week by week.
# The column holds XML text, so convert it to numbers first: recent
# matplotlib versions plot string data as categories instead of values.
# Unparseable or missing entries become NaN and show up as gaps.
plt.plot(pd.to_numeric(df['median_2'], errors='coerce'))
Out[143]:
In [141]:
# Preview the first rows of the current `df`.
df.head()
Out[141]: