In [31]:
#Program to scrape craigslist apartment listings and write the results of searches for particular DC neighborhoods to a CSV file

# -*- coding: utf-8 -*-

import requests
import csv
from bs4 import BeautifulSoup
import time
from time import strftime
import datetime
import re
import os

In [33]:
#URLs to scrape, one search per DC neighborhood.
CH_url = "https://washingtondc.craigslist.org/search/apa?query=Columbia%20Heights&s=0"
AM_url = "https://washingtondc.craigslist.org/search/apa?query=Adams%20Morgan&s=0"

#Retrieve the current year, month, and day to stamp each row with its retrieval date
now = datetime.datetime.now().timetuple()
year_month_day = [now.tm_year, now.tm_mon, now.tm_mday]

def scraper(url):

    #Retrieve and parse url

    r = requests.get(url)

    soup = BeautifulSoup(r.content, "lxml")
    

    #Write the desired content (price, neighborhood, bedrooms, sqft, etc.) from each posting into parallel lists

    price_data = []
    data1 = soup.find_all("span", {"class": "price"})
    for price in data1:
        price_data.append(price.contents[0])

    neighborhood_data = []
    data2 = soup.find_all("div", {"class": "querybox"})
    for neighborhood in data2:
        neighborhood = neighborhood.contents[3]
        neighborhood = re.sub(r'.*value="(.*)".*', r'\1', str(neighborhood))
        neighborhood_data.append(neighborhood)    
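    #contents[3] of the querybox div is expected to be the search input element; the regex
    #pulls its value attribute, so the recorded "neighborhood" is the search term itself
    #(e.g. "Columbia Heights" or "Adams Morgan").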
        

    
    br_data = []
    sqft_data = []
    data3 = soup.find_all("span", {"class": "housing"})
    for size in data3:
        size = size.contents[0]
        if "br" in size:
            size_br = re.sub(r'.*\D([\d]*)br.*', r'\1', str(size))
            br_data.append(size_br)
        else:
            br_data.append("NA")
        if "ft" in size:
            size_ft = re.sub(r'.*\D([\d]*)ft.*', r'\1', str(size))
            sqft_data.append(size_ft)
        else:
            sqft_data.append("NA")
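    #For example, a housing string like "/ 2br - 900ft" (format assumed) yields a br_data
    #entry of "2" and an sqft_data entry of "900"; postings that omit either field get "NA"
    #so the columns stay aligned across lists.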


    date_data = []
    data4 = soup.find_all("span", {"class": "pl"})
    for date in data4:
        date_data.append(date.contents[1].contents[0])

    comment_data = []
    data5 = soup.find_all("span", {"class": "pl"})
    for comment in data5:
        comment_data.append(comment.contents[3].contents[0])

    #The "pl" span packs several fields together. The link data is only the tail of the url
    #(the posting id), so this appends it to the base craigslist url to rebuild the full path.

    link_data = []
    data6 = soup.find_all("span", {"class": "pl"})
    for link in data6:
        baselink = url[:35]
        link = re.sub(r'.*([\d]{10}).*', baselink + r'/doc/apa/'+ r'\1' + r'.html', str(link))
        link_data.append(link)
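    #url[:35] is the scheme plus host ("https://washingtondc.craigslist.org"), so a posting
    #whose 10-digit id is, say, 1234567890 (hypothetical) maps to
    #https://washingtondc.craigslist.org/doc/apa/1234567890.html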

    #Like above, except this strips away all other content to provide just the ID number

    id_data = []
    data7 = soup.find_all("span", {"class": "pl"})
    for ids in data7:
        ids = re.sub(r'.*data-id="(.*[0-9])\".*', r'\1', str(ids))
        id_data.append(ids)

    data = zip(price_data, br_data, sqft_data, date_data, comment_data, link_data, id_data)
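    #zip pairs the parallel lists element-wise, so each tuple holds one posting's
    #price, bedrooms, sqft, date, comment, link, and id in column order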

    #Append to the output file, writing the header row only when the file does not already exist
    write_header = not os.path.exists("scraper_output.csv")
    with open("scraper_output.csv", "ab") as output:
        csv_out = csv.writer(output)
        if write_header:
            csv_out.writerow(["Neighborhood", "Price", "Bedrooms", "Sqft", "Date Posted", "Comment", "Link", "ID", "Retrieval_Year", "Retrieval_Month", "Retrieval_Day"])
        for row in data:
            csv_out.writerow(neighborhood_data + map(clean_any, row) + year_month_day)
   
    
    

#list_maker creates a list of urls for each page of the query to be used by the scraper
def list_maker(url):
    
    counter = 0
    r = requests.get(url)

    soup = BeautifulSoup(r.content, "lxml")
    
    #Gets the total count of postings, to be used with the counter
    total_count = soup.find("span", {"class": "totalcount"}).contents[0]
    
    #Take the base url (which ends in "s=0") and create one url per page of 100 results,
    #replacing the trailing page offset with 0, 100, 200, ...
    url_list = []
    while counter < int(total_count):
        url_page = url[:-1] + str(counter)
        counter += 100
        url_list.append(url_page)
        
    return url_list
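    #For example, a query with a totalcount of 250 (hypothetical) would yield three urls,
    #ending in s=0, s=100, and s=200, i.e. one per page of up to 100 postings.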


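#clean_str and clean_any below coerce unicode values to UTF-8 byte strings before writing,
#since the Python 2 csv writer cannot handle unicode objects directly; non-string values
#pass through unchanged.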
def clean_str(string):
    return unicode(string).encode('utf-8')


def clean_any(anything):
    return clean_str(anything) if isinstance(anything, basestring) else anything
    

for item in list_maker(AM_url):
    scraper(item)
#    time.sleep(10)

for item in list_maker(CH_url):
    scraper(item)
#    time.sleep(10)
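#The commented-out time.sleep(10) calls above would pause between page requests; running
#without them makes long scrapes more likely to hit connection errors like the one below.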


---------------------------------------------------------------------------
ConnectionError                           Traceback (most recent call last)
<ipython-input-33-bb32ef8e251f> in <module>()
    151 
    152 for item in list_maker(CH_url):
--> 153     scraper(item)
    154 #    time.sleep(10)

<ipython-input-33-bb32ef8e251f> in scraper(url)
     11     #Retrieve and parse url
     12 
---> 13     r = requests.get(url)
     14 
     15     soup = BeautifulSoup(r.content, "lxml")

C:\Users\Mason\Anaconda\lib\site-packages\requests\api.pyc in get(url, **kwargs)
     53 
     54     kwargs.setdefault('allow_redirects', True)
---> 55     return request('get', url, **kwargs)
     56 
     57 

C:\Users\Mason\Anaconda\lib\site-packages\requests\api.pyc in request(method, url, **kwargs)
     42 
     43     session = sessions.Session()
---> 44     return session.request(method=method, url=url, **kwargs)
     45 
     46 

C:\Users\Mason\Anaconda\lib\site-packages\requests\sessions.pyc in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert)
    380             'allow_redirects': allow_redirects,
    381         }
--> 382         resp = self.send(prep, **send_kwargs)
    383 
    384         return resp

C:\Users\Mason\Anaconda\lib\site-packages\requests\sessions.pyc in send(self, request, **kwargs)
    483         start = datetime.utcnow()
    484         # Send the request
--> 485         r = adapter.send(request, **kwargs)
    486         # Total elapsed time of the request (approximately)
    487         r.elapsed = datetime.utcnow() - start

C:\Users\Mason\Anaconda\lib\site-packages\requests\adapters.pyc in send(self, request, stream, timeout, verify, cert, proxies)
    370 
    371         except MaxRetryError as e:
--> 372             raise ConnectionError(e)
    373 
    374         except _ProxyError as e:

ConnectionError: HTTPSConnectionPool(host='washingtondc.craigslist.org', port=443): Max retries exceeded with url: /search/apa?query=Columbia%20Heights&s=1500 (Caused by <class 'socket.error'>: [Errno 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond)
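
In [ ]:
#The ConnectionError above means craigslist stopped responding partway through the
#Columbia Heights pages. A hedged sketch (not part of the original run): wrap requests.get
#in a small retry helper with a timeout and a pause between attempts, and call it from
#scraper and list_maker in place of requests.get.

def fetch(url, retries=3, pause=10):
    #Try the request a few times, sleeping between attempts; re-raise if every attempt fails
    for attempt in range(retries):
        try:
            return requests.get(url, timeout=30)
        except requests.exceptions.ConnectionError:
            if attempt == retries - 1:
                raise
            time.sleep(pause)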

In [ ]:
url = "https://washingtondc.craigslist.org/search/apa?query=Adams%20Morgan&s=0"

def list_maker(url):
    
    counter = 0
    r = requests.get(url)

    soup = BeautifulSoup(r.content, "lxml")
    
    #Gets the total count of postings, to be used with the counter
    total_count = soup.find("span", {"class": "totalcount"}).contents[0]
    
    #Take the base url (which ends in "s=0") and create one url per page of 100 results,
    #replacing the trailing page offset with 0, 100, 200, ...
    url_list = []
    while counter < int(total_count):
        url_page = url[:-1] + str(counter)
        counter += 100
        url_list.append(url_page)
        
    return url_list

for item in list_maker(url):
    print item

In [ ]: