notebook.community

Edit and run



In [2]:

    
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
import BeautifulSoup as soup
import json
import time



In [1]:

    
url = 'http://www.zillow.com/homes/recently_sold/Los-Angeles-CA/12447_rid/0-625000_price/0-2325_mp/34.087782,-118.175297,33.94749,-118.563595_rect/11_zm/'
#http://www.zillow.com/homes/recently_sold/Los-Angeles-CA/12447_rid/0-625000_price/0-2325_mp/34.078683,-118.219242,33.956318,-118.51965_rect/11_zm/2_p/



In [81]:

    
cache = {}



In [197]:

    
def xml2listing(articles):
    id = articles.get('id')
    statustype = articles.get('statustype')
    latitude = articles.get('latitude')
    longitude = articles.get('longitude')
    href = ''
    address = ''
    numphotos = ''
    bath = ''
    title = ''
    image = ''
    bed = ''
    label = ''
    datasize = ''
    sqft = ''
    id = ''
    statustype = ''
    for elem in articles:
        if str(type(elem)) == "<class 'BeautifulSoup.Tag'>":
            if elem.name == 'figure':
                a = elem.contents[0]
                if str(type(a)) == "<class 'BeautifulSoup.Tag'>":
                    href = a.get('href')
                    address = a.contents[0].get('alt')
                    if len(a.contents) > 1:
                        numphotos = a.contents[1].contents[0]
            elif elem.name == 'div':
                cls = elem.get('class')
                if cls == 'minibubble template hide':
                    jdata = elem.contents[0]
                    jdata = jdata.replace('\\\\/', '/').replace('\\', '')
                    j = json.loads(jdata)
                    bath = j['bath']
                    title = j['title']
                    image = j['image']
                    bed = j['bed']
                    label = j['label']
                    datasize = j['datasize']
                    sqft = j['sqft']
                elif cls == 'property-listing-data':
                    continue
                elif cls == 'terse-list-card-actions':
                    continue
                else:
                    continue
        else:
            continue
    listing = {}
    listing['id'] = id
    listing['statustype'] = statustype
    listing['latitude'] = latitude
    listing['longitude'] = longitude
    listing['href'] = href
    listing['address'] = address
    listing['numphotos'] = numphotos
    listing['bath'] = bath
    listing['title'] = title
    listing['image'] = image
    listing['bed'] = bed
    listing['label'] = label
    listing['datasize'] = datasize
    listing['sqft'] = sqft
    return listing



In [ ]:

    
TODO: sold on, sold price



In [204]:

    
p = 1
listings = []
while p < 25000:
    print p
    url2 = url
    if p > 1:
        url2 += '/' + str(p) + '_p'
    try:
        r = cache[url2]
    except KeyError:
        r = requests.get(url2)
        cache[url2] = r
        time.sleep(1)
    s = soup.BeautifulSoup(r.content)
    articles = s.find('article')
    if articles:
        listing = xml2listing(articles)
        print '.'
        listings.append(listing)
    p += 1









    



1
.
2
.
3
.
4
.
5
.
6
.
7
.
8
.
9
.
10
.
11
.
12
.
13
.
14
.
15
.
16
.
17
.
18
.
19
.
20
.
21
.
22
.
23
.
24
.
25
.
26
.
27
.
28
.
29
.
30
.
31
.
32
.
33
.
34
.
35
.
36
.
37
.
38
.
39
.
40
.
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-204-33344b629c3b> in <module>()
     12         cache[url2] = r
     13         time.sleep(1)
---> 14     s = soup.BeautifulSoup(r.content)
     15     articles = s.find('article')
     16     if articles:

/Users/kylepolich/anaconda/lib/python2.7/site-packages/BeautifulSoup.pyc in __init__(self, *args, **kwargs)
   1520             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
   1521         kwargs['isHTML'] = True
-> 1522         BeautifulStoneSoup.__init__(self, *args, **kwargs)
   1523 
   1524     SELF_CLOSING_TAGS = buildTagMap(None,

/Users/kylepolich/anaconda/lib/python2.7/site-packages/BeautifulSoup.pyc in __init__(self, markup, parseOnlyThese, fromEncoding, markupMassage, smartQuotesTo, convertEntities, selfClosingTags, isHTML)
   1145         self.markupMassage = markupMassage
   1146         try:
-> 1147             self._feed(isHTML=isHTML)
   1148         except StopParsing:
   1149             pass

/Users/kylepolich/anaconda/lib/python2.7/site-packages/BeautifulSoup.pyc in _feed(self, inDocumentEncoding, isHTML)
   1187         self.reset()
   1188 
-> 1189         SGMLParser.feed(self, markup)
   1190         # Close out any unfinished strings and close all the open tags.
   1191         self.endData()

/Users/kylepolich/anaconda/lib/python2.7/sgmllib.pyc in feed(self, data)
    102 
    103         self.rawdata = self.rawdata + data
--> 104         self.goahead(0)
    105 
    106     def close(self):

/Users/kylepolich/anaconda/lib/python2.7/sgmllib.pyc in goahead(self, end)
    136                         i = i+1
    137                         continue
--> 138                     k = self.parse_starttag(i)
    139                     if k < 0: break
    140                     i = k

/Users/kylepolich/anaconda/lib/python2.7/sgmllib.pyc in parse_starttag(self, i)
    294             j = j+1
    295         self.__starttag_text = rawdata[start_pos:j]
--> 296         self.finish_starttag(tag, attrs)
    297         return j
    298 

/Users/kylepolich/anaconda/lib/python2.7/sgmllib.pyc in finish_starttag(self, tag, attrs)
    336                 method = getattr(self, 'do_' + tag)
    337             except AttributeError:
--> 338                 self.unknown_starttag(tag, attrs)
    339                 return -1
    340             else:

/Users/kylepolich/anaconda/lib/python2.7/site-packages/BeautifulSoup.pyc in unknown_starttag(self, name, attrs, selfClosing)
   1338         self.endData()
   1339 
-> 1340         if not self.isSelfClosingTag(name) and not selfClosing:
   1341             self._smartPop(name)
   1342 

/Users/kylepolich/anaconda/lib/python2.7/site-packages/BeautifulSoup.pyc in isSelfClosingTag(self, name)
   1210         self-closing tag according to this parser."""
   1211         return self.SELF_CLOSING_TAGS.has_key(name) \
-> 1212                or self.instanceSelfClosingTags.has_key(name)
   1213 
   1214     def reset(self):

KeyboardInterrupt:



In [210]:

    
s = soup.BeautifulSoup(r.content)
articles = s.find('article')



In [211]:

    
xml2listing(articles)









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-211-14d0d1ddc9b8> in <module>()
----> 1 xml2listing(articles)

<ipython-input-197-6deea68489b6> in xml2listing(articles)
      1 def xml2listing(articles):
----> 2     id = articles.get('id')
      3     statustype = articles.get('statustype')
      4     latitude = articles.get('latitude')
      5     longitude = articles.get('longitude')

AttributeError: 'NoneType' object has no attribute 'get'



In [201]:

    
df = pd.DataFrame(listings)



In [203]:

    
len(listings)









    Out[203]:





40



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [ ]:



In [112]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import requests
import xml.etree.ElementTree
import pandas as pd



In [5]:



In [107]:

    
zipcodes = ['90024']



In [108]:

    
rows = []
for zipcode in zipcodes:
    url = 'http://api.trulia.com/webservices.php?library=TruliaStats&function=getZipCodeStats&zipCode=' + zipcode + '&state=CA&startDate=2015-01-01&endDate=2016-01-11&apikey=' + key
    r = requests.get(url)
    fname = 'data.xml'
    f = open(fname, 'w')
    f.write(r.text)
    f.close()
    e = xml.etree.ElementTree.parse(fname).getroot()
    stats = e.find('response').find('TruliaStats').find('trafficStats').findall('trafficStat')
    for stat in stats:
        row = [zipcode, stat.find('date').text, stat.find('percentStateTraffic').text, stat.find('percentNationalTraffic').text]
        rows.append(row)



In [115]:

    
df = pd.DataFrame(rows, columns=['zipcode', 'date', 'pStateTraffic', 'pNationalTraffic'])
plt.plot(df['pStateTraffic'])









    Out[115]:





[<matplotlib.lines.Line2D at 0x109d5a110>]



In [17]:



In [58]:



In [ ]:



In [137]:

    
rows = []

listingStats = e.find('response').find('TruliaStats').find('listingStats').findall('listingStat')
for listingStat in listingStats:
    week = listingStat.find('weekEndingDate').text
    subcategories = listingStat.find('listingPrice').findall('subcategory')
    row = {'week': week}
    for subcat in subcategories:
        t = subcat.find('type').text
        if t.find('Bedroom Properties') == 2:
            br = t[0:2].strip()
            num = subcat.find('numberOfProperties').text
            median = subcat.find('medianListingPrice').text
            mean = subcat.find('averageListingPrice').text
            row['num_' + br] = num
            row['median_' + br] = median
            row['mean_' + br] = mean
    rows.append(row)



In [140]:

    
df = pd.DataFrame(rows)



In [143]:

    
plt.plot(df['median_2'])









    Out[143]:





[<matplotlib.lines.Line2D at 0x10a2d6a10>]



In [141]:

    
df.head()









    Out[141]:






  
    
      
      mean_1
      mean_2
      mean_3
      mean_4
      mean_5
      mean_6
      mean_8
      median_1
      median_2
      median_3
      ...
      median_6
      median_8
      num_1
      num_2
      num_3
      num_4
      num_5
      num_6
      num_8
      week
    
  
  
    
      0
       529389
       1571835
       3303602
       3665592
       3134107
       NaN
       2995000
       549000
       1013500
       2881571
      ...
       NaN
       2995000
        9
       61
       20
        6
       3
       NaN
       1
       2015-01-03
    
    
      1
       531504
       1550382
       3275355
       3794154
       3134107
       NaN
       2995000
       549000
        989286
       2429643
      ...
       NaN
       2995000
        9
       63
       24
        8
       3
       NaN
       1
       2015-01-10
    
    
      2
       545462
       1566104
       3275062
       3748120
       3561429
       NaN
       2995000
       546271
        986636
       2703857
      ...
       NaN
       2995000
       12
       69
       27
        9
       2
       NaN
       1
       2015-01-17
    
    
      3
       552524
       1556027
       3242167
       3845129
       3872500
       NaN
       2995000
       548000
        987121
       2851000
      ...
       NaN
       2995000
       14
       73
       27
       10
       2
       NaN
       1
       2015-01-24
    
    
      4
       529364
       1567261
       3206141
       3800777
       3885714
       NaN
       2995000
       531671
        984836
       2846071
      ...
       NaN
       2995000
       16
       71
       27
       10
       2
       NaN
       1
       2015-01-31
    
  

5 rows × 22 columns

	mean_1	mean_2	mean_3	mean_4	mean_5	mean_6	mean_8	median_1	median_2	median_3	...	median_6	median_8	num_1	num_2	num_3	num_4	num_5	num_6	num_8	week
0	529389	1571835	3303602	3665592	3134107	NaN	2995000	549000	1013500	2881571	...	NaN	2995000	9	61	20	6	3	NaN	1	2015-01-03
1	531504	1550382	3275355	3794154	3134107	NaN	2995000	549000	989286	2429643	...	NaN	2995000	9	63	24	8	3	NaN	1	2015-01-10
2	545462	1566104	3275062	3748120	3561429	NaN	2995000	546271	986636	2703857	...	NaN	2995000	12	69	27	9	2	NaN	1	2015-01-17
3	552524	1556027	3242167	3845129	3872500	NaN	2995000	548000	987121	2851000	...	NaN	2995000	14	73	27	10	2	NaN	1	2015-01-24
4	529364	1567261	3206141	3800777	3885714	NaN	2995000	531671	984836	2846071	...	NaN	2995000	16	71	27	10	2	NaN	1	2015-01-31