In [8]:
# Program to scrape Craigslist and write the contents of particular neighborhood searches into a CSV file
import requests
import csv
from bs4 import BeautifulSoup
from time import strftime
import re
In [9]:
# URLs for scraping, one for each DC neighborhood
CH_url = "https://washingtondc.craigslist.org/search/apa?query=Columbia%20Heights&s=0"
AM_url = "https://washingtondc.craigslist.org/search/apa?query=Adams%20Morgan&s=0"
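# Other neighborhoods could be added the same way. A hedged sketch (DP_url and
# "Dupont Circle" are illustrative, not part of the original notebook); urllib.quote
# produces the %20-escaping used in the hand-built URLs above:
#   import urllib
#   DP_url = "https://washingtondc.craigslist.org/search/apa?query=" + urllib.quote("Dupont Circle") + "&s=0"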
def scraper(url):
    # Retrieve and parse the search results page
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "lxml")
    # Collect each desired field (price, neighborhood, etc.) into its own list
    price_data = []
    data1 = soup.find_all("span", {"class": "price"})
    for price in data1:
        price_data.append(price.contents[0])
    neighborhood_data = []
    data2 = soup.find_all("span", {"class": "pnr"})
    for neighborhood in data2:
        neighborhood_data.append(neighborhood.contents[1].contents[0])
    size_data = []
    data3 = soup.find_all("span", {"class": "housing"})
    for size in data3:
        size_data.append(size.contents[0])
    # The "pl" subheader spans hold the date, comment, link, and ID
    date_data = []
    data4 = soup.find_all("span", {"class": "pl"})
    for date in data4:
        date_data.append(date.contents[1].contents[0])
    comment_data = []
    data5 = soup.find_all("span", {"class": "pl"})
    for comment in data5:
        comment_data.append(comment.contents[3].contents[0])
    # The link in the "pl" subheader is only the tail of the listing URL, so
    # prepend the base Craigslist URL (the first 35 characters of the search
    # URL) to recover the full path.
    link_data = []
    data6 = soup.find_all("span", {"class": "pl"})
    for link in data6:
        baselink = url[:35]
        link = re.sub(r'.*(/doc.*)\.html.*', baselink + r'\1' + r'.html', str(link))
        link_data.append(link)
    # As above, except this strips away everything but the listing's ID number
    id_data = []
    data7 = soup.find_all("span", {"class": "pl"})
    for ids in data7:
        ids = re.sub(r'.*data-id="(.*[0-9])\".*', r'\1', str(ids))
        id_data.append(ids)
    data = zip(price_data, neighborhood_data, size_data, date_data, comment_data, link_data, id_data)
    # url[53:58] is the start of the neighborhood name in the query string
    # ("Colum" or "Adams"), which keeps the two output files distinct
    with open(url[53:58] + "_scraper_output.csv", "wb") as output:
        csv_out = csv.writer(output)
        csv_out.writerow(["Price", "Neighborhood", "Size", "Date Posted", "Comment", "Link", "ID"])
        for row in data:
            csv_out.writerow(row)

scraper(CH_url)
scraper(AM_url)
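The two regex substitutions above are the least obvious part of the scraper, so the cell below sanity-checks them on a hand-written stand-in for Craigslist's "pl" markup; the href and data-id values are made up for illustration.
In [ ]:
# Hypothetical listing span, shaped like the real "pl" markup the scraper sees
sample = '<span class="pl" data-id="1234567890"><a href="/doc/apa/1234567890.html">2br apartment</a></span>'
baselink = "https://washingtondc.craigslist.org"
# Rebuild the full listing URL from the relative href
print re.sub(r'.*(/doc.*)\.html.*', baselink + r'\1' + r'.html', sample)
# Strip everything but the listing's ID number
print re.sub(r'.*data-id="(.*[0-9])\".*', r'\1', sample)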
In [10]:
"""
time_data = []
data8 = time_data.append([time])
"""
"""
def clean_row(dirty_row):
stripped_row = [data.strip() for data in dirty_row]
price_data,neighborhood_data,size_data,date_data,comment_data,link_data = stripped_row
neighborhood_data = re.sub(r'\((.+?)\)', r'\1', neighborhood_data)
size_data = re.sub(r'.*?([0-9])+br\s*-\s*([0-9]*).+', r'\1, \2', size_data)
#date_data = re.sub(r'')
#comment_data = re.sub(r'')
baselink = url[:34]
link_data = re.sub(r'"(/doc+.*)"', r'\1', baselink+link_data)
return neighborhood_data
def clean(dirty_rows):
# clean_rows = [clean(row) row for row in dirty_rows]
clean_rows = []
for row in dirty_rows:
clean_rows.append(clean_row(row))
return clean_rows
dirty = []
with open("scraper_output.csv", "rb") as f:
reader = csv.reader(f)
for row in reader:
dirty.append(row)
print clean(dirty)
"""
Out[10]:
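For reference, a sketch of what the disabled cleaning regexes above would produce, run on made-up field strings rather than real scraped data:
In [ ]:
# Strip the parentheses from a neighborhood field
print re.sub(r'\((.+?)\)', r'\1', "(Columbia Heights)")
# Pull bedrooms and square footage out of a housing/size field
print re.sub(r'.*?([0-9])+br\s*-\s*([0-9]*).+', r'\1, \2', "/ 2br - 900ft2 - ")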
In [11]:
# Timestamp for the current run
strftime("%Y-%m-%d %H:%M:%S")
Out[11]:
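One possible use for this timestamp (a sketch, not part of the original scraper): tag each run's CSV so successive scrapes don't overwrite each other.
In [ ]:
# Hypothetical: one output file per scrape run
run_stamp = strftime("%Y-%m-%d_%H-%M-%S")
print "Colum_scraper_output_" + run_stamp + ".csv"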