In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import BeautifulSoup as soup
import json
import time

In [1]:
url = 'http://www.zillow.com/homes/recently_sold/Los-Angeles-CA/12447_rid/0-625000_price/0-2325_mp/34.087782,-118.175297,33.94749,-118.563595_rect/11_zm/'
#http://www.zillow.com/homes/recently_sold/Los-Angeles-CA/12447_rid/0-625000_price/0-2325_mp/34.078683,-118.219242,33.956318,-118.51965_rect/11_zm/2_p/

In [81]:
cache = {}

In [197]:
def xml2listing(articles):
    id = articles.get('id')
    statustype = articles.get('statustype')
    latitude = articles.get('latitude')
    longitude = articles.get('longitude')
    href = ''
    address = ''
    numphotos = ''
    bath = ''
    title = ''
    image = ''
    bed = ''
    label = ''
    datasize = ''
    sqft = ''
    id = ''
    statustype = ''
    for elem in articles:
        if str(type(elem)) == "<class 'BeautifulSoup.Tag'>":
            if elem.name == 'figure':
                a = elem.contents[0]
                if str(type(a)) == "<class 'BeautifulSoup.Tag'>":
                    href = a.get('href')
                    address = a.contents[0].get('alt')
                    if len(a.contents) > 1:
                        numphotos = a.contents[1].contents[0]
            elif elem.name == 'div':
                cls = elem.get('class')
                if cls == 'minibubble template hide':
                    jdata = elem.contents[0]
                    jdata = jdata.replace('\\\\/', '/').replace('\\', '')
                    j = json.loads(jdata)
                    bath = j['bath']
                    title = j['title']
                    image = j['image']
                    bed = j['bed']
                    label = j['label']
                    datasize = j['datasize']
                    sqft = j['sqft']
                elif cls == 'property-listing-data':
                    continue
                elif cls == 'terse-list-card-actions':
                    continue
                else:
                    continue
        else:
            continue
    listing = {}
    listing['id'] = id
    listing['statustype'] = statustype
    listing['latitude'] = latitude
    listing['longitude'] = longitude
    listing['href'] = href
    listing['address'] = address
    listing['numphotos'] = numphotos
    listing['bath'] = bath
    listing['title'] = title
    listing['image'] = image
    listing['bed'] = bed
    listing['label'] = label
    listing['datasize'] = datasize
    listing['sqft'] = sqft
    return listing

In [ ]:
TODO: sold on, sold price

In [204]:
p = 1
listings = []
while p < 25000:
    print p
    url2 = url
    if p > 1:
        url2 += '/' + str(p) + '_p'
    try:
        r = cache[url2]
    except KeyError:
        r = requests.get(url2)
        cache[url2] = r
        time.sleep(1)
    s = soup.BeautifulSoup(r.content)
    articles = s.find('article')
    if articles:
        listing = xml2listing(articles)
        print '.'
        listings.append(listing)
    p += 1


1
.
2
.
3
.
4
.
5
.
6
.
7
.
8
.
9
.
10
.
11
.
12
.
13
.
14
.
15
.
16
.
17
.
18
.
19
.
20
.
21
.
22
.
23
.
24
.
25
.
26
.
27
.
28
.
29
.
30
.
31
.
32
.
33
.
34
.
35
.
36
.
37
.
38
.
39
.
40
.
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-204-33344b629c3b> in <module>()
     12         cache[url2] = r
     13         time.sleep(1)
---> 14     s = soup.BeautifulSoup(r.content)
     15     articles = s.find('article')
     16     if articles:

/Users/kylepolich/anaconda/lib/python2.7/site-packages/BeautifulSoup.pyc in __init__(self, *args, **kwargs)
   1520             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
   1521         kwargs['isHTML'] = True
-> 1522         BeautifulStoneSoup.__init__(self, *args, **kwargs)
   1523 
   1524     SELF_CLOSING_TAGS = buildTagMap(None,

/Users/kylepolich/anaconda/lib/python2.7/site-packages/BeautifulSoup.pyc in __init__(self, markup, parseOnlyThese, fromEncoding, markupMassage, smartQuotesTo, convertEntities, selfClosingTags, isHTML)
   1145         self.markupMassage = markupMassage
   1146         try:
-> 1147             self._feed(isHTML=isHTML)
   1148         except StopParsing:
   1149             pass

/Users/kylepolich/anaconda/lib/python2.7/site-packages/BeautifulSoup.pyc in _feed(self, inDocumentEncoding, isHTML)
   1187         self.reset()
   1188 
-> 1189         SGMLParser.feed(self, markup)
   1190         # Close out any unfinished strings and close all the open tags.
   1191         self.endData()

/Users/kylepolich/anaconda/lib/python2.7/sgmllib.pyc in feed(self, data)
    102 
    103         self.rawdata = self.rawdata + data
--> 104         self.goahead(0)
    105 
    106     def close(self):

/Users/kylepolich/anaconda/lib/python2.7/sgmllib.pyc in goahead(self, end)
    136                         i = i+1
    137                         continue
--> 138                     k = self.parse_starttag(i)
    139                     if k < 0: break
    140                     i = k

/Users/kylepolich/anaconda/lib/python2.7/sgmllib.pyc in parse_starttag(self, i)
    294             j = j+1
    295         self.__starttag_text = rawdata[start_pos:j]
--> 296         self.finish_starttag(tag, attrs)
    297         return j
    298 

/Users/kylepolich/anaconda/lib/python2.7/sgmllib.pyc in finish_starttag(self, tag, attrs)
    336                 method = getattr(self, 'do_' + tag)
    337             except AttributeError:
--> 338                 self.unknown_starttag(tag, attrs)
    339                 return -1
    340             else:

/Users/kylepolich/anaconda/lib/python2.7/site-packages/BeautifulSoup.pyc in unknown_starttag(self, name, attrs, selfClosing)
   1338         self.endData()
   1339 
-> 1340         if not self.isSelfClosingTag(name) and not selfClosing:
   1341             self._smartPop(name)
   1342 

/Users/kylepolich/anaconda/lib/python2.7/site-packages/BeautifulSoup.pyc in isSelfClosingTag(self, name)
   1210         self-closing tag according to this parser."""
   1211         return self.SELF_CLOSING_TAGS.has_key(name) \
-> 1212                or self.instanceSelfClosingTags.has_key(name)
   1213 
   1214     def reset(self):

KeyboardInterrupt: 

In [210]:
s = soup.BeautifulSoup(r.content)
articles = s.find('article')

In [211]:
xml2listing(articles)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-211-14d0d1ddc9b8> in <module>()
----> 1 xml2listing(articles)

<ipython-input-197-6deea68489b6> in xml2listing(articles)
      1 def xml2listing(articles):
----> 2     id = articles.get('id')
      3     statustype = articles.get('statustype')
      4     latitude = articles.get('latitude')
      5     longitude = articles.get('longitude')

AttributeError: 'NoneType' object has no attribute 'get'

In [201]:
df = pd.DataFrame(listings)

In [203]:
len(listings)


Out[203]:
40

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [112]:
%matplotlib inline
import matplotlib.pyplot as plt
import requests
import xml.etree.ElementTree
import pandas as pd

In [5]:


In [107]:
zipcodes = ['90024']

In [108]:
rows = []
for zipcode in zipcodes:
    url = 'http://api.trulia.com/webservices.php?library=TruliaStats&function=getZipCodeStats&zipCode=' + zipcode + '&state=CA&startDate=2015-01-01&endDate=2016-01-11&apikey=' + key
    r = requests.get(url)
    fname = 'data.xml'
    f = open(fname, 'w')
    f.write(r.text)
    f.close()
    e = xml.etree.ElementTree.parse(fname).getroot()
    stats = e.find('response').find('TruliaStats').find('trafficStats').findall('trafficStat')
    for stat in stats:
        row = [zipcode, stat.find('date').text, stat.find('percentStateTraffic').text, stat.find('percentNationalTraffic').text]
        rows.append(row)

In [115]:
df = pd.DataFrame(rows, columns=['zipcode', 'date', 'pStateTraffic', 'pNationalTraffic'])
plt.plot(df['pStateTraffic'])


Out[115]:
[<matplotlib.lines.Line2D at 0x109d5a110>]

In [17]:


In [58]:


In [ ]:


In [137]:
rows = []

listingStats = e.find('response').find('TruliaStats').find('listingStats').findall('listingStat')
for listingStat in listingStats:
    week = listingStat.find('weekEndingDate').text
    subcategories = listingStat.find('listingPrice').findall('subcategory')
    row = {'week': week}
    for subcat in subcategories:
        t = subcat.find('type').text
        if t.find('Bedroom Properties') == 2:
            br = t[0:2].strip()
            num = subcat.find('numberOfProperties').text
            median = subcat.find('medianListingPrice').text
            mean = subcat.find('averageListingPrice').text
            row['num_' + br] = num
            row['median_' + br] = median
            row['mean_' + br] = mean
    rows.append(row)

In [140]:
df = pd.DataFrame(rows)

In [143]:
plt.plot(df['median_2'])


Out[143]:
[<matplotlib.lines.Line2D at 0x10a2d6a10>]

In [141]:
df.head()


Out[141]:
mean_1 mean_2 mean_3 mean_4 mean_5 mean_6 mean_8 median_1 median_2 median_3 ... median_6 median_8 num_1 num_2 num_3 num_4 num_5 num_6 num_8 week
0 529389 1571835 3303602 3665592 3134107 NaN 2995000 549000 1013500 2881571 ... NaN 2995000 9 61 20 6 3 NaN 1 2015-01-03
1 531504 1550382 3275355 3794154 3134107 NaN 2995000 549000 989286 2429643 ... NaN 2995000 9 63 24 8 3 NaN 1 2015-01-10
2 545462 1566104 3275062 3748120 3561429 NaN 2995000 546271 986636 2703857 ... NaN 2995000 12 69 27 9 2 NaN 1 2015-01-17
3 552524 1556027 3242167 3845129 3872500 NaN 2995000 548000 987121 2851000 ... NaN 2995000 14 73 27 10 2 NaN 1 2015-01-24
4 529364 1567261 3206141 3800777 3885714 NaN 2995000 531671 984836 2846071 ... NaN 2995000 16 71 27 10 2 NaN 1 2015-01-31

5 rows × 22 columns