In [51]:
#Program to scrape Craigslist apartment listings and write the results of particular DC neighborhood searches to CSV files
# -*- coding: utf-8 -*-
import requests
import csv
from bs4 import BeautifulSoup
import time
from time import strftime
import datetime
import re
import os
In [52]:
#URLs for scraping, one for each DC neighborhood.
CH_url = "https://washingtondc.craigslist.org/search/apa?query=Columbia%20Heights&s=0"
AM_url = "https://washingtondc.craigslist.org/search/apa?query=Adams%20Morgan&s=0"
#Retrieve the current year, month, and day; these are appended to every CSV row
#and used in the output filename.
now = datetime.datetime.now().timetuple()
year_month_day = (now.tm_year, now.tm_mon, now.tm_mday)
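#Illustrative example (hypothetical run date): on 2016-03-14 the lines above give
#year_month_day == (2016, 3, 14). With that date, scraper() below would write to
#"(2016, 3, 14)Colum_scraper_output.csv", since url[53:58] slices "Colum" (or
#"Adams") out of the query string as a crude per-neighborhood filename slug.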
def scraper(url):
    #Retrieve and parse the url
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "lxml")
    #Total count of postings for this query (not used further inside scraper;
    #list_maker below uses the same value to decide how many pages to request)
    total_count = soup.find("span", {"class": "totalcount"}).contents[0]
    #Write the desired content (price, neighborhood, size, date, title) to lists,
    #one list per column of the eventual CSV
    price_data = []
    data1 = soup.find_all("span", {"class": "price"})
    for price in data1:
        price_data.append(price.contents[0])
    neighborhood_data = []
    data2 = soup.find_all("span", {"class": "pnr"})
    for neighborhood in data2:
        neighborhood_data.append(neighborhood.contents[1].contents[0])
    size_data = []
    data3 = soup.find_all("span", {"class": "housing"})
    for size in data3:
        size_data.append(size.contents[0])
    #The "pl" span holds the posting date (expected at contents[1]), the title
    #text (contents[3]), the listing url, and the listing id
    date_data = []
    data4 = soup.find_all("span", {"class": "pl"})
    for date in data4:
        date_data.append(date.contents[1].contents[0])
    comment_data = []
    data5 = soup.find_all("span", {"class": "pl"})
    for comment in data5:
        comment_data.append(comment.contents[3].contents[0])
    #There is much content in this subheader. The href in each "pl" span is only
    #the tail of the listing url, so this prepends the base craigslist url
    #(url[:35]) to return the full path.
    link_data = []
    data6 = soup.find_all("span", {"class": "pl"})
    for link in data6:
        baselink = url[:35]
        link = re.sub(r'.*(/doc.*)\.html.*', baselink + r'\1' + r'.html', str(link))
        link_data.append(link)
    #Like above, except this strips away all other content to leave just the
    #listing's data-id number
    id_data = []
    data7 = soup.find_all("span", {"class": "pl"})
    for ids in data7:
        ids = re.sub(r'.*data-id="(.*[0-9])\".*', r'\1', str(ids))
        id_data.append(ids)
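    #Illustrative example (hypothetical listing): a "pl" span whose anchor has
    #href="/doc/apa/1234567890.html" and data-id="1234567890" is rewritten to
    #  link -> "https://washingtondc.craigslist.org/doc/apa/1234567890.html"
    #  id   -> "1234567890"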
    data = zip(price_data, neighborhood_data, size_data, date_data, comment_data, link_data, id_data)
    #Append rows to a per-day, per-neighborhood CSV. Check for the file *before*
    #opening it: open(..., "ab") creates the file, so checking afterwards would
    #always skip the header row.
    outfile = str(year_month_day) + url[53:58] + "_scraper_output.csv"
    write_header = not os.path.exists(outfile)
    with open(outfile, "ab") as output:
        csv_out = csv.writer(output)
        if write_header:
            csv_out.writerow(["Price", "Neighborhood", "Size", "Date Posted", "Comment", "Link", "ID", "Retrieval_Year", "Retrieval_Month", "Retrieval_Day"])
        for row in data:
            csv_out.writerow(row + year_month_day)
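    #A written row then looks roughly like the following (values are illustrative,
    #not taken from a real listing):
    #  "$1800", "(Columbia Heights)", "1br - 650ft2", "Mar 14", "Sunny 1BR near metro",
    #  "https://washingtondc.craigslist.org/doc/apa/1234567890.html", "1234567890", 2016, 3, 14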
#list_maker creates a list of urls, one for each results page of the query, to be fed to the scraper
def list_maker(url):
    counter = 0
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "lxml")
    #Gets the total count of postings, to be used with the counter
    total_count = soup.find("span", {"class": "totalcount"}).contents[0]
    print total_count
    #page_number is printed for reference only; the loop below does the actual paging
    page_number = (int(total_count) / 100) + 1
    print page_number
    #Takes the base url and creates a list of all pages of the query. The base
    #urls already end in "&s=0", so strip that offset before appending the new
    #one (otherwise the pages come out as "&s=00", "&s=0100", and so on).
    url_list = []
    while counter < int(total_count):
        url_page = url.rsplit("&s=", 1)[0] + "&s=" + str(counter)
        counter += 100
        url_list.append(url_page)
    return url_list
#print url_list
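#Illustrative example: if totalcount came back as 250, list_maker(CH_url) would
#return three page urls with start offsets 0, 100, and 200:
#  https://washingtondc.craigslist.org/search/apa?query=Columbia%20Heights&s=0
#  https://washingtondc.craigslist.org/search/apa?query=Columbia%20Heights&s=100
#  https://washingtondc.craigslist.org/search/apa?query=Columbia%20Heights&s=200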
#Scrape every results page for each neighborhood (passing each page url, not the
#base url, to scraper)
for item in list_maker(CH_url):
    scraper(item)
    # time.sleep(10)
for item in list_maker(AM_url):
    scraper(item)
    # time.sleep(10)
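#Uncommenting the time.sleep(10) lines above inserts a ten-second pause between
#page requests, which throttles the scraper so it hits craigslist less aggressively.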
"""
def counter(url):
while (count + 100) < total_count:
count = count + 100
else:
next_neighborhood
def scraper(results_so_far=[], start_value=0):
page = 'dummy'
if page_is_empty(page):
return results_so_far
else:
return scraper(results)
"""
#scraper(CH_url)
#scraper(AM_url)
In [ ]:
"""
time_data = []
data8 = time_data.append([time])
"""
"""
def clean_row(dirty_row):
stripped_row = [data.strip() for data in dirty_row]
price_data,neighborhood_data,size_data,date_data,comment_data,link_data = stripped_row
neighborhood_data = re.sub(r'\((.+?)\)', r'\1', neighborhood_data)
size_data = re.sub(r'.*?([0-9])+br\s*-\s*([0-9]*).+', r'\1, \2', size_data)
#date_data = re.sub(r'')
#comment_data = re.sub(r'')
baselink = url[:34]
link_data = re.sub(r'"(/doc+.*)"', r'\1', baselink+link_data)
return neighborhood_data
def clean(dirty_rows):
# clean_rows = [clean(row) row for row in dirty_rows]
clean_rows = []
for row in dirty_rows:
clean_rows.append(clean_row(row))
return clean_rows
dirty = []
with open("scraper_output.csv", "rb") as f:
reader = csv.reader(f)
for row in reader:
dirty.append(row)
print clean(dirty)
strftime("%Y-%m-%d %H:%M:%S")
"""