In [ ]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs4
%matplotlib inline

Scraper 1

A fast Craigslist scraper that grabs only the price, size, title, and city of each listing.


In [2]:
url_base = 'https://sfbay.craigslist.org/search/eby/apa'
params = dict(search_distance=4, postal=94720)
rsp = requests.get(url_base, params=params)
html = bs4(rsp.text, 'html.parser')
apts = html.find_all('p', attrs={'class': 'row'})


requests encodes the params dict into the query string, so the request above fetches:

https://sfbay.craigslist.org/search/eby/apa?postal=94720&search_distance=4
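
To double-check what was requested, the response object records the final URL after requests encodes the params (rsp here is the response from the search cell above):

In [ ]:
# requests exposes the fully encoded URL it actually fetched
print(rsp.url)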

In [8]:
import time

cl_data = []
for i in range(0, 1200, 100):  # craigslist pages results 100 at a time via the s param
    params = dict(search_distance=4, postal=94720, s=i)
    rsp = requests.get(url_base, params=params)
    html = bs4(rsp.text, 'html.parser')
    apts = html.find_all('p', attrs={'class': 'row'})
    for apt in apts:
        url = "https://sfbay.craigslist.org" + apt.find('a', attrs={'class': 'hdrlnk'})['href']
        try:
            size = apt.find_all(attrs={'class': 'housing'})[0].text
        except IndexError:
            size = "Not Listed"
        title = apt.find('a', attrs={'class': 'hdrlnk'}).text
        try:
            price = apt.find_all(attrs={'class': 'price'})[0].text
        except IndexError:
            price = "Not Listed"
        location = apt.find_all(attrs={'class': 'pnr'})[0].text
        # print(url, size, title, price, location)
        cl_string = url + "," + size + "," + title + "," + price + "," + location + "\n"
        cl_data.append(cl_string)
    time.sleep(5)  # be polite between page requests

In [10]:
f1 = open('cl.csv', 'w')
f1.write('url,size,title,price,location\n')
for data in cl_data:
    try:
        f1.write(data)
    except UnicodeEncodeError:  # skip rows the default codec can't encode
        pass
f1.close()

print("done")


done
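
Note that listing titles and locations often contain commas, so joining fields with "," produces malformed CSV rows. A safer variant (a sketch, assuming each listing were collected as a list of fields rather than a pre-joined string) lets the csv module handle quoting:

In [ ]:
import csv

# Hypothetical: rows holds one [url, size, title, price, location] list
# per listing, collected in the scraping loop above.
rows = [[url, size, title, price, location]]
with open('cl.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['url', 'size', 'title', 'price', 'location'])
    writer.writerows(rows)  # fields containing commas get quoted automatically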

Scraper 2

A more thorough scraper that follows each listing's URL and grabs the size, price, city, lat/long, features, open-house info, and image links.


In [131]:
import time, json

url_base = 'https://sfbay.craigslist.org/search/eby/apa'
for i in range(400, 1100, 100):
    time.sleep(3)  # pause between search-results pages
    params = dict(search_distance=4, postal=94720, s=i)
    rsp = requests.get(url_base, params=params)
    html = bs4(rsp.text, 'html.parser')
    apts = html.find_all('p', attrs={'class': 'row'})
    data = {}
    for apt in apts:
        time.sleep(1)  # pause between individual listing pages
        url = "https://sfbay.craigslist.org" + apt.find('a', attrs={'class': 'hdrlnk'})['href']
        r = requests.get(url).text
        soup = bs4(r, 'html.parser')
        title = soup.find_all("span", {"id": "titletextonly"})[0].text
        try:
            size = soup.find("span", {"class": "housing"}).text
        except AttributeError:
            size = "n/a"
        try:
            price = soup.find_all("span", {"class": "price"})[0].text
        except IndexError:
            price = "n/a"
        try:
            city = soup.find_all("small")[0].text
        except IndexError:
            city = "n/a"
        try:
            posting = soup.find_all("div", {"class": "viewposting"})[0]
            longitude = posting['data-longitude']
            latitude = posting['data-latitude']
        except (IndexError, KeyError):
            longitude = "n/a"
            latitude = "n/a"
        try:
            features = soup.find(id='postingbody').text
        except AttributeError:
            features = "n/a"
        try:
            open_house = soup.find("span", {"class": "otherpostings"}).text
        except AttributeError:
            open_house = "n/a"
        # collect image links and the google maps link from the listing page
        images = []
        gmap = "n/a"
        for a in soup.find_all('a', href=True):
            if "images.craigslist.org" in a['href']:
                images.append(a['href'])
            if "maps.google.com" in a['href']:
                gmap = a['href']
        data[url] = {
            'title': title,
            'price': price,
            'city': city,
            'longitude': longitude,
            'latitude': latitude,
            'features': features,
            'open_house': open_house,
            'images': images,
            'gmap': gmap,
            'size': size,
        }

    # write each page of results to its own json file
    filename = "data" + str(i) + ".json"
    with open(filename, 'w') as outfile:
        json.dump(data, outfile)
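
The per-page dumps can then be merged and loaded into a DataFrame for analysis (a sketch, assuming the data*.json files written above sit in the working directory):

In [ ]:
import glob, json
import pandas as pd

# Merge every per-page dump into one dict keyed by listing URL,
# then pivot it into a DataFrame with one row per listing.
merged = {}
for path in glob.glob('data*.json'):
    with open(path) as f:
        merged.update(json.load(f))

df = pd.DataFrame.from_dict(merged, orient='index')
df.index.name = 'url'
df.head()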

In [ ]: