In [51]:
#Program to scrape Craigslist apartment listings and write the results of particular DC neighborhood searches to CSV files
# -*- coding: utf-8 -*-
import requests
import csv
from bs4 import BeautifulSoup
import time
from time import strftime
import datetime
import re
import os
In [52]:
#URLs for scraping, one for each DC neighborhood.
CH_url = "https://washingtondc.craigslist.org/search/apa?query=Columbia%20Heights&s=0"
AM_url = "https://washingtondc.craigslist.org/search/apa?query=Adams%20Morgan&s=0"
#Retrieve the current year, month, and day; these are appended to every CSV row
#and used in the output filename.
now = datetime.datetime.now().timetuple()
year_month_day = (now.tm_year, now.tm_mon, now.tm_mday)
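#Illustrative example (hypothetical run date): on 2016-03-14 the lines above give
#year_month_day == (2016, 3, 14). With that date, scraper() below would write to
#"(2016, 3, 14)Colum_scraper_output.csv", since url[53:58] slices "Colum" (or
#"Adams") out of the query string as a crude per-neighborhood filename slug.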
def scraper(url):
    #Retrieve and parse the url
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "lxml")
    #Total count of postings for this query (not used further inside scraper;
    #list_maker below uses the same value to decide how many pages to request)
    total_count = soup.find("span", {"class": "totalcount"}).contents[0]
    #Write the desired content (price, neighborhood, size, date, title) to lists,
    #one list per column of the eventual CSV
    price_data = []
    data1 = soup.find_all("span", {"class": "price"})
    for price in data1:
        price_data.append(price.contents[0])
    neighborhood_data = []
    data2 = soup.find_all("span", {"class": "pnr"})
    for neighborhood in data2:
        neighborhood_data.append(neighborhood.contents[1].contents[0])
    size_data = []
    data3 = soup.find_all("span", {"class": "housing"})
    for size in data3:
        size_data.append(size.contents[0])
    #The "pl" span holds the posting date (expected at contents[1]), the title
    #text (contents[3]), the listing url, and the listing id
    date_data = []
    data4 = soup.find_all("span", {"class": "pl"})
    for date in data4:
        date_data.append(date.contents[1].contents[0])
    comment_data = []
    data5 = soup.find_all("span", {"class": "pl"})
    for comment in data5:
        comment_data.append(comment.contents[3].contents[0])
    #There is much content in this subheader. The href in each "pl" span is only
    #the tail of the listing url, so this prepends the base craigslist url
    #(url[:35]) to return the full path.
    link_data = []
    data6 = soup.find_all("span", {"class": "pl"})
    for link in data6:
        baselink = url[:35]
        link = re.sub(r'.*(/doc.*)\.html.*', baselink + r'\1' + r'.html', str(link))
        link_data.append(link)
    #Like above, except this strips away all other content to leave just the
    #listing's data-id number
    id_data = []
    data7 = soup.find_all("span", {"class": "pl"})
    for ids in data7:
        ids = re.sub(r'.*data-id="(.*[0-9])\".*', r'\1', str(ids))
        id_data.append(ids)
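    #Illustrative example (hypothetical listing): a "pl" span whose anchor has
    #href="/doc/apa/1234567890.html" and data-id="1234567890" is rewritten to
    #  link -> "https://washingtondc.craigslist.org/doc/apa/1234567890.html"
    #  id   -> "1234567890"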
    data = zip(price_data, neighborhood_data, size_data, date_data, comment_data, link_data, id_data)
    #Append rows to a per-day, per-neighborhood CSV. Check for the file *before*
    #opening it: open(..., "ab") creates the file, so checking afterwards would
    #always skip the header row.
    outfile = str(year_month_day) + url[53:58] + "_scraper_output.csv"
    write_header = not os.path.exists(outfile)
    with open(outfile, "ab") as output:
        csv_out = csv.writer(output)
        if write_header:
            csv_out.writerow(["Price", "Neighborhood", "Size", "Date Posted", "Comment", "Link", "ID", "Retrieval_Year", "Retrieval_Month", "Retrieval_Day"])
        for row in data:
            csv_out.writerow(row + year_month_day)
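    #A written row then looks roughly like the following (values are illustrative,
    #not taken from a real listing):
    #  "$1800", "(Columbia Heights)", "1br - 650ft2", "Mar 14", "Sunny 1BR near metro",
    #  "https://washingtondc.craigslist.org/doc/apa/1234567890.html", "1234567890", 2016, 3, 14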
#list_maker creates a list of urls, one for each results page of the query, to be fed to the scraper
def list_maker(url):
    counter = 0
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "lxml")
    #Gets the total count of postings, to be used with the counter
    total_count = soup.find("span", {"class": "totalcount"}).contents[0]
    print total_count
    #page_number is printed for reference only; the loop below does the actual paging
    page_number = (int(total_count) / 100) + 1
    print page_number
    #Takes the base url and creates a list of all pages of the query. The base
    #urls already end in "&s=0", so strip that offset before appending the new
    #one (otherwise the pages come out as "&s=00", "&s=0100", and so on).
    url_list = []
    while counter < int(total_count):
        url_page = url.rsplit("&s=", 1)[0] + "&s=" + str(counter)
        counter += 100
        url_list.append(url_page)
    return url_list
#print url_list
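#Illustrative example: if totalcount came back as 250, list_maker(CH_url) would
#return three page urls with start offsets 0, 100, and 200:
#  https://washingtondc.craigslist.org/search/apa?query=Columbia%20Heights&s=0
#  https://washingtondc.craigslist.org/search/apa?query=Columbia%20Heights&s=100
#  https://washingtondc.craigslist.org/search/apa?query=Columbia%20Heights&s=200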
#Scrape every results page for each neighborhood (passing each page url, not the
#base url, to scraper)
for item in list_maker(CH_url):
    scraper(item)
    # time.sleep(10)
for item in list_maker(AM_url):
    scraper(item)
    # time.sleep(10)
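#Uncommenting the time.sleep(10) lines above inserts a ten-second pause between
#page requests, which throttles the scraper so it hits craigslist less aggressively.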
"""
def counter(url):
while (count + 100) < total_count:
count = count + 100
else:
next_neighborhood
def scraper(results_so_far=[], start_value=0):
page = 'dummy'
if page_is_empty(page):
return results_so_far
else:
return scraper(results)
"""
#scraper(CH_url)
#scraper(AM_url)
In [ ]:
"""
time_data = []
data8 = time_data.append([time])
"""
"""
def clean_row(dirty_row):
stripped_row = [data.strip() for data in dirty_row]
price_data,neighborhood_data,size_data,date_data,comment_data,link_data = stripped_row
neighborhood_data = re.sub(r'\((.+?)\)', r'\1', neighborhood_data)
size_data = re.sub(r'.*?([0-9])+br\s*-\s*([0-9]*).+', r'\1, \2', size_data)
#date_data = re.sub(r'')
#comment_data = re.sub(r'')
baselink = url[:34]
link_data = re.sub(r'"(/doc+.*)"', r'\1', baselink+link_data)
return neighborhood_data
def clean(dirty_rows):
# clean_rows = [clean(row) row for row in dirty_rows]
clean_rows = []
for row in dirty_rows:
clean_rows.append(clean_row(row))
return clean_rows
dirty = []
with open("scraper_output.csv", "rb") as f:
reader = csv.reader(f)
for row in reader:
dirty.append(row)
print clean(dirty)
strftime("%Y-%m-%d %H:%M:%S")
"""