In [ ]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs4
%matplotlib inline
In [2]:
# East Bay apartments-for-rent search, within 4 miles of the 94720 zip code
url_base = 'http://sfbay.craigslist.org/search/eby/apa'
params = dict(search_distance=4, postal=94720)
rsp = requests.get(url_base, params=params)
html = bs4(rsp.text, 'html.parser')
# Each listing appears as a <p class="row"> element in this version of the markup
apts = html.find_all('p', attrs={'class': 'row'})
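A quick sanity check on the parse (a minimal sketch, assuming the request above succeeded; Craigslist's markup changes over time, so the `p.row` selector may need updating):

In [ ]:
# How many listings did this page return, and what does the first one look like?
print(len(apts))
if apts:
    print(apts[0].find('a', attrs={'class': 'hdrlnk'}).text)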
In [8]:
import time

cl_data = []
# Craigslist paginates results 100 at a time via the 's' offset parameter
for i in range(0, 1200, 100):
    params = dict(search_distance=4, postal=94720, s=i)
    rsp = requests.get(url_base, params=params)
    html = bs4(rsp.text, 'html.parser')
    apts = html.find_all('p', attrs={'class': 'row'})
    for apt in apts:
        url = "https://sfbay.craigslist.org" + apt.find('a', attrs={'class': 'hdrlnk'})['href']
        try:
            size = apt.find_all(attrs={'class': 'housing'})[0].text
        except IndexError:
            size = "Not Listed"
        title = apt.find('a', attrs={'class': 'hdrlnk'}).text
        try:
            price = apt.find_all(attrs={'class': 'price'})[0].text
        except IndexError:
            price = "Not Listed"
        location = apt.find_all(attrs={'class': 'pnr'})[0].text
        # print(url, size, title, price, location)
        cl_string = url + "," + size + "," + title + "," + price + "," + location + "\n"
        cl_data.append(cl_string)
    time.sleep(5)  # be polite between page requests
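Joining fields with bare commas will corrupt the CSV whenever a title or location itself contains a comma. A more robust variant (a hypothetical sketch, not what the loop above does) would collect tuples and let the csv module handle quoting:

In [ ]:
import csv
# Hypothetical: assumes the rows were kept as (url, size, title, price, location)
# tuples rather than the pre-joined strings built above.
rows = [("https://sfbay.craigslist.org/...", "2br", "Spacious, sunny 2BR", "$2400", "(berkeley)")]
with open('cl_quoted.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['url', 'size', 'title', 'price', 'location'])
    writer.writerows(rows)  # fields containing commas get quoted automatically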
In [10]:
with open('cl.csv', 'w') as f1:
    f1.write('url,size,title,price,location\n')
    for data in cl_data:
        try:
            f1.write(data)
        except UnicodeEncodeError:
            pass  # skip rows with characters the default encoding can't handle
print("done")
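With the file written, pandas (imported above) can read it back for a first look. Note the caveat above: rows whose fields contained commas will be misaligned.

In [ ]:
df = pd.read_csv('cl.csv')
df.head()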
In [131]:
import time, json

# Fetch each listing's detail page for the later result pages (offsets 400-1000)
for i in range(400, 1100, 100):
    time.sleep(3)
    url_base = 'http://sfbay.craigslist.org/search/eby/apa'
    params = dict(search_distance=4, postal=94720, s=i)
    rsp = requests.get(url_base, params=params)
    html = bs4(rsp.text, 'html.parser')
    apts = html.find_all('p', attrs={'class': 'row'})
    data = {}
    for apt in apts:
        time.sleep(1)  # throttle the per-listing requests
        url = "https://sfbay.craigslist.org" + apt.find('a', attrs={'class': 'hdrlnk'})['href']
        soup = bs4(requests.get(url).text, 'html.parser')
        final_dict = {}
        title = soup.find_all("span", {"id": "titletextonly"})[0].text
        try:
            size = soup.find("span", {"class": "housing"}).text
        except AttributeError:
            size = "n/a"
        try:
            price = soup.find_all("span", {"class": "price"})[0].text
        except IndexError:
            price = "n/a"
        try:
            city = soup.find_all("small")[0].text
        except IndexError:
            city = "n/a"
        try:
            # The map <div> carries the geocode as data attributes
            longitude = soup.find_all("div", {"class": "viewposting"})[0]['data-longitude']
            latitude = soup.find_all("div", {"class": "viewposting"})[0]['data-latitude']
        except (IndexError, KeyError):
            longitude = "n/a"
            latitude = "n/a"
        try:
            features = soup.find(id='postingbody').text
        except AttributeError:
            features = "n/a"
        try:
            open_house = soup.find("span", {"class": "otherpostings"}).text
        except AttributeError:
            open_house = "n/a"
        # Collect image and Google Maps links from the page's anchors
        images = []
        gmap = "n/a"
        for a in soup.find_all('a', href=True):
            if "images.craigslist.org" in a['href']:
                images.append(a['href'])
            if "maps.google.com" in a['href']:
                gmap = a['href']
        final_dict['title'] = title
        final_dict['price'] = price
        final_dict['city'] = city
        final_dict['longitude'] = longitude
        final_dict['latitude'] = latitude
        final_dict['features'] = features
        final_dict['open_house'] = open_house
        final_dict['images'] = images
        final_dict['gmap'] = gmap
        final_dict['size'] = size
        data[url] = final_dict
    # One JSON file per result page, keyed by listing URL
    filename = "data" + str(i) + ".json"
    with open(filename, 'w') as outfile:
        json.dump(data, outfile)
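To work with the scraped details, the per-page JSON files can be merged back into one table (a minimal sketch, assuming the data400.json through data1000.json files written above sit in the working directory):

In [ ]:
import glob, json
records = {}
for fname in glob.glob('data*.json'):
    with open(fname) as f:
        records.update(json.load(f))
# One row per listing URL, one column per scraped field
listings = pd.DataFrame.from_dict(records, orient='index')
listings[['title', 'price', 'size', 'city']].head()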