In [250]:
import requests
import pandas as pd
import numpy as np
import lxml.html  # lxml.html is what fromstring() below lives in; plain "import lxml" is not enough
import time

In [501]:
base_url = 'http://berlin.craigslist.de/search/apa?s='
urls = []

for i in range(0, 2):
    # craigslist paginates the apartment search in steps of 100; zfill(3) zero-pads the offset
    urls.append(base_url + str(i * 100).zfill(3))
print urls


['http://berlin.craigslist.de/search/apa?s=000', 'http://berlin.craigslist.de/search/apa?s=100']
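The results pages are offset in steps of 100, so the same pattern scales to more pages. A small helper along these lines would keep the pagination logic in one place (build_search_urls and its step parameter are illustrative names, not taken from the notebook):

def build_search_urls(base_url, pages, step=100):
    # one URL per results page; the offset is zero-padded to three digits as above
    return [base_url + str(i * step).zfill(3) for i in range(pages)]

print build_search_urls('http://berlin.craigslist.de/search/apa?s=', 2)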

In [534]:
def crawl_urls(urls):
    print "Indexing..."
    
    all_listings = pd.DataFrame()
    
    
    for url in urls:
        print "\t>>> " + str(url)
        data = parse_listing(url)
        print data
        print "----------------------"
        frames = [all_listings, data]
        all_listings = pd.concat(frames)
        time.sleep(3)
    
    
    print all_listings.shape
    all_listings = all_listings.set_index('id')
    filename = "/Users/arne/projects/Craigslist_BER/" + str(time.strftime("%Y-%m-%d")) + "-results.csv"
    all_listings.to_csv(filename)
    return all_listings
    # export df as csv

df = crawl_urls(urls)


Indexing...
	>>> http://berlin.craigslist.de/search/apa?s=000
Error: Unknown.
None
----------------------
	>>> http://berlin.craigslist.de/search/apa?s=100
Error: Unknown.
None
----------------------
(0, 0)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-534-667d05a0377f> in <module>()
     22     # export df as csv
     23 
---> 24 df = crawl_urls(urls)

<ipython-input-534-667d05a0377f> in crawl_urls(urls)
     16 
     17     print all_listings.shape
---> 18     all_listings = all_listings.set_index('id')
     19     filename = "/Users/arne/projects/Craigslist_BER/" + str(time.strftime("%Y-%m-%d")) + "-results.csv"
     20     all_listings.to_csv(filename)

/Users/arne/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in set_index(self, keys, drop, append, inplace, verify_integrity)
   2827                 names.append(None)
   2828             else:
-> 2829                 level = frame[col].values
   2830                 names.append(col)
   2831                 if drop:

/Users/arne/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   2001             # get column
   2002             if self.columns.is_unique:
-> 2003                 return self._get_item_cache(key)
   2004 
   2005             # duplicate columns

/Users/arne/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
    665             return cache[item]
    666         except Exception:
--> 667             values = self._data.get(item)
    668             res = self._box_item_values(item, values)
    669             cache[item] = res

/Users/arne/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item)
   1653     def get(self, item):
   1654         if self.items.is_unique:
-> 1655             _, block = self._find_block(item)
   1656             return block.get(item)
   1657         else:

/Users/arne/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in _find_block(self, item)
   1933 
   1934     def _find_block(self, item):
-> 1935         self._check_have(item)
   1936         for i, block in enumerate(self.blocks):
   1937             if item in block:

/Users/arne/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in _check_have(self, item)
   1940     def _check_have(self, item):
   1941         if item not in self.items:
-> 1942             raise KeyError('no item named %s' % com.pprint_thing(item))
   1943 
   1944     def reindex_axis(self, new_axis, method=None, axis=0, copy=True):

KeyError: u'no item named id'
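The KeyError is a follow-on error: both pages came back as None from parse_listing, so the concatenated frame is empty (shape (0, 0)) and has no 'id' column to set as index. A more defensive version of the crawl loop would skip failed pages and only index and write the CSV when something was actually parsed; a sketch reusing the same helpers, with crawl_urls_safe as an illustrative name:

def crawl_urls_safe(urls):
    print "Indexing..."
    frames = []
    for url in urls:
        print "\t>>> " + str(url)
        data = parse_listing(url)
        # skip pages where parsing failed or returned nothing
        if data is not None and not data.empty:
            frames.append(data)
        time.sleep(3)  # be polite between requests
    if not frames:
        print "No listings parsed."
        return pd.DataFrame()
    all_listings = pd.concat(frames).set_index('id')
    filename = "/Users/arne/projects/Craigslist_BER/" + str(time.strftime("%Y-%m-%d")) + "-results.csv"
    all_listings.to_csv(filename)
    return all_listings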


In [533]:
def parse_listing(url):
    lim = 6  # handbrake: only parse the first few result rows per page
    req = requests.get(url).text
    tree = lxml.html.fromstring(req)
    resp = tree.xpath(".//p")  # each search result is a <p data-pid="...">
    apartment = {}

    df = pd.DataFrame()

    # For all apartment listings on this results page
    for i in range(1, lim):

        try:
            apartment['id'] = str(resp[(i-1)].attrib['data-pid']).lower()
            apartment['url'] = "http://berlin.craigslist.de/apa/" + apartment['id'] + ".html"
            # the remaining fields come back from xpath() as (possibly empty) lists of text nodes
            apartment['date'] = tree.xpath('//*[@id="searchform"]/div[2]/div[3]/p[' + str(i) + ']/span/span[2]/time/text()')
            apartment['price'] = tree.xpath('//*[@id="searchform"]/div[2]/div[3]/p[' + str(i) + ']/span/span[3]/span[1]/text()')
            apartment['rooms'] = tree.xpath('//*[@id="searchform"]/div[2]/div[3]/p[' + str(i) + ']/span/span[3]/span[2]/text()')
            apartment['loc'] = tree.xpath('//*[@id="searchform"]/div[2]/div[3]/p[' + str(i) + ']/span/span[3]/span[3]/small/text()')

            df = df.append(apartment)

        except:
            # any failure above drops the whole page; see the note on surfacing
            # the real exception right after this cell
            print "Error: Unknown."
            return

    return df
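The bare except: is why every failure shows up only as "Error: Unknown." plus a None return for the whole page. One likely culprit is df.append(apartment) being handed a plain dict, which pandas only accepts with ignore_index=True, but the real point is that whatever goes wrong is swallowed. A sketch that prints the actual error, skips only the broken row, and builds the frame from a list of dicts (parse_listing_verbose is an illustrative name):

def parse_listing_verbose(url):
    lim = 6  # handbrake
    tree = lxml.html.fromstring(requests.get(url).text)
    resp = tree.xpath(".//p")
    rows = []
    for i in range(1, lim):
        try:
            row = {'id': str(resp[i - 1].attrib['data-pid']).lower()}
            row['url'] = "http://berlin.craigslist.de/apa/" + row['id'] + ".html"
            base = '//*[@id="searchform"]/div[2]/div[3]/p[' + str(i) + ']/span/'
            row['date'] = tree.xpath(base + 'span[2]/time/text()')
            row['price'] = tree.xpath(base + 'span[3]/span[1]/text()')
            row['rooms'] = tree.xpath(base + 'span[3]/span[2]/text()')
            row['loc'] = tree.xpath(base + 'span[3]/span[3]/small/text()')
            rows.append(row)
        except Exception as e:
            # report the actual problem and carry on with the next row
            print "Error on row %d of %s: %s" % (i, url, e)
    return pd.DataFrame(rows)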

In [469]:
'''
To-do:
1. Implement wrangling for item-values
2. 


'''

df


Out[469]:
date loc price rooms url
5022861804 [Mai 13] [' (freisingerstrasse 17a, schoeneberg)'] [€670] [/ 2br - 50m, - ] http://berlin.craigslist.de/apa/5022861804.html
5006061984 [Mai 2] [' (berlin)'] [€600] [/ 2br - ] http://berlin.craigslist.de/apa/5006061984.html
4983880840 [Apr 18] [u' (berlin wedding 22 l\xfcderitz str.)'] [€300] [/ 1br - 45m, - ] http://berlin.craigslist.de/apa/4983880840.html

In [521]:
def wrangler_date(date):
    # placeholder: date cells stay as raw lists like [Mai 17] for now
    return date


def wrangler_rooms(rooms):
    # placeholder: rooms cells stay as raw lists like [/ 2br - 50m, - ] for now
    return rooms


def wrangler_price(price):
    # price cells look like [€670]; work on their string repr, strip the
    # surrounding "[u'\u20ac" / "']" characters and cast what is left to int
    price = str(price).decode("utf-8")
    price = price.strip("[u'\u20ac").strip("']")
    price = int(price)

    return price
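Since the price cells are single-element lists like [€670], pulling the digits out of the first element with a regex is a little sturdier than stripping the repr by hand. A sketch (wrangler_price_re is an alternative name, not a drop-in change to the cell above); it returns None instead of raising when no digits are found:

import re

def wrangler_price_re(price):
    # price is e.g. [u'\u20ac670']; grab the digits from the first element
    text = price[0] if price else u''
    m = re.search(r'\d+', text)
    return int(m.group(0)) if m else None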

In [449]:
def wrangler_loc(loc):
    # map a raw loc cell like [' (friedrichshain)'] onto a known district name
    removals = ['/', '-', '(', ')', ':', " "]
    districts = ['schoeneberg', 'mitte', 'friedrichshain', 'prenzlauerberg', 'steglitz', 'pankow', 'wedding']
    loc = str(loc).lower()
    # normalise German umlauts so they match the spellings in districts
    loc = loc.replace('ü', 'ue')
    loc = loc.replace('ä', 'ae')
    loc = loc.replace('ö', 'oe')
    loc = loc.replace('ß', 'ss')
    loc = loc.replace('berlin', '')

    for rem in removals:
        if rem in loc:
            loc = loc.replace(rem, '')

    # score every district by edit distance to the cleaned-up string
    distances = []
    for dist in districts:
        tup = dist, levenshtein_distance(dist, loc)
        distances.append(tup)

    top_score = min([i[1] for i in distances])
    loc = str([i[0] for i in distances if i[1] == top_score]).strip("['").strip("']")
    # only accept reasonably close matches; everything else becomes None
    if top_score < 5:
        return loc
    else:
        return
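Going by the frames further down, cells that contain a recognisable district survive the cleanup, while the generic ' (berlin)' cells fall past the distance threshold and come back as None:

print wrangler_loc([' (friedrichshain)'])   # -> friedrichshain
print wrangler_loc([' (berlin)'])           # -> None (no district within the threshold)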

In [525]:
df2 = df.copy()
df2['loc'] = map(wrangler_loc, df['loc'])
df2['price'] = map(wrangler_price, df['price'])
df2['rooms'] = map(wrangler_rooms, df['rooms'])
df2['date'] = map(wrangler_date, df['date'])

df2[:50]


Out[525]:
date loc price rooms url
id
5029212167 [Mai 17] None 660 [/ 2br - 48m, - ] http://berlin.craigslist.de/apa/5029212167.html
5029084078 [Mai 17] None 330 [/ 2br - 63m, - ] http://berlin.craigslist.de/apa/5029084078.html
5029084078 [Mai 17] None 330 [/ 2br - 63m, - ] http://berlin.craigslist.de/apa/5029084078.html
5021449984 [Mai 17] None 600 [/ 2br - ] http://berlin.craigslist.de/apa/5021449984.html
5021449984 [Mai 17] None 600 [/ 2br - ] http://berlin.craigslist.de/apa/5021449984.html
5021449984 [Mai 17] None 600 [/ 2br - ] http://berlin.craigslist.de/apa/5021449984.html
5029061931 [Mai 17] None 850 [/ 2br - 53m, - ] http://berlin.craigslist.de/apa/5029061931.html
5029061931 [Mai 17] None 850 [/ 2br - 53m, - ] http://berlin.craigslist.de/apa/5029061931.html
5029061931 [Mai 17] None 850 [/ 2br - 53m, - ] http://berlin.craigslist.de/apa/5029061931.html
5029061931 [Mai 17] None 850 [/ 2br - 53m, - ] http://berlin.craigslist.de/apa/5029061931.html
5029050140 [Mai 17] None 60 [/ 1br - 50m, - ] http://berlin.craigslist.de/apa/5029050140.html
5029050140 [Mai 17] None 60 [/ 1br - 50m, - ] http://berlin.craigslist.de/apa/5029050140.html
5029050140 [Mai 17] None 60 [/ 1br - 50m, - ] http://berlin.craigslist.de/apa/5029050140.html
5029050140 [Mai 17] None 60 [/ 1br - 50m, - ] http://berlin.craigslist.de/apa/5029050140.html
5029050140 [Mai 17] None 60 [/ 1br - 50m, - ] http://berlin.craigslist.de/apa/5029050140.html
5012067872 [Mai 10] friedrichshain 650 [/ 40m, - ] http://berlin.craigslist.de/apa/5012067872.html
5010443169 [Mai 10] pankow 975 [/ 2br - 68m, - ] http://berlin.craigslist.de/apa/5010443169.html
5010443169 [Mai 10] pankow 975 [/ 2br - 68m, - ] http://berlin.craigslist.de/apa/5010443169.html
4995332171 [Mai 10] friedrichshain 975 [/ 1br - 74m, - ] http://berlin.craigslist.de/apa/4995332171.html
4995332171 [Mai 10] friedrichshain 975 [/ 1br - 74m, - ] http://berlin.craigslist.de/apa/4995332171.html
4995332171 [Mai 10] friedrichshain 975 [/ 1br - 74m, - ] http://berlin.craigslist.de/apa/4995332171.html
4987715908 [Mai 10] None 450 [/ 1br - ] http://berlin.craigslist.de/apa/4987715908.html
4987715908 [Mai 10] None 450 [/ 1br - ] http://berlin.craigslist.de/apa/4987715908.html
4987715908 [Mai 10] None 450 [/ 1br - ] http://berlin.craigslist.de/apa/4987715908.html
4987715908 [Mai 10] None 450 [/ 1br - ] http://berlin.craigslist.de/apa/4987715908.html
5018233578 [Mai 10] None 700 [/ 2br - 115m, - ] http://berlin.craigslist.de/apa/5018233578.html
5018233578 [Mai 10] None 700 [/ 2br - 115m, - ] http://berlin.craigslist.de/apa/5018233578.html
5018233578 [Mai 10] None 700 [/ 2br - 115m, - ] http://berlin.craigslist.de/apa/5018233578.html
5018233578 [Mai 10] None 700 [/ 2br - 115m, - ] http://berlin.craigslist.de/apa/5018233578.html
5018233578 [Mai 10] None 700 [/ 2br - 115m, - ] http://berlin.craigslist.de/apa/5018233578.html
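Every pid should appear once, but several ids repeat in df2. Since id is already the index, one cheap way to keep a single row per listing is to deduplicate on the id column alone (drop_duplicates over full rows would trip on the list-valued cells); df2_unique is an illustrative name:

# keep only the first row seen for each listing id
df2_unique = df2.reset_index().drop_duplicates('id').set_index('id')
print df2_unique.shape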

In [519]:



Out[519]:
id
5029212167     [/ 2br - 48m,  - ]
5029084078     [/ 2br - 63m,  - ]
5029084078     [/ 2br - 63m,  - ]
5021449984             [/ 2br - ]
5021449984             [/ 2br - ]
5021449984             [/ 2br - ]
5029061931     [/ 2br - 53m,  - ]
5029061931     [/ 2br - 53m,  - ]
5029061931     [/ 2br - 53m,  - ]
5029061931     [/ 2br - 53m,  - ]
5029050140     [/ 1br - 50m,  - ]
5029050140     [/ 1br - 50m,  - ]
5029050140     [/ 1br - 50m,  - ]
5029050140     [/ 1br - 50m,  - ]
5029050140     [/ 1br - 50m,  - ]
5012067872           [/ 40m,  - ]
5010443169     [/ 2br - 68m,  - ]
5010443169     [/ 2br - 68m,  - ]
4995332171     [/ 1br - 74m,  - ]
4995332171     [/ 1br - 74m,  - ]
4995332171     [/ 1br - 74m,  - ]
4987715908             [/ 1br - ]
4987715908             [/ 1br - ]
4987715908             [/ 1br - ]
4987715908             [/ 1br - ]
5018233578    [/ 2br - 115m,  - ]
5018233578    [/ 2br - 115m,  - ]
5018233578    [/ 2br - 115m,  - ]
5018233578    [/ 2br - 115m,  - ]
5018233578    [/ 2br - 115m,  - ]
Name: rooms, dtype: object

In [374]:
def levenshtein_distance(first, second):
    """Find the Levenshtein distance between two strings."""
    if len(first) > len(second):
        first, second = second, first
    if len(second) == 0:
        return len(first)
    first_length = len(first) + 1
    second_length = len(second) + 1
    distance_matrix = [[0] * second_length for x in range(first_length)]
    for i in range(first_length):
        distance_matrix[i][0] = i
    for j in range(second_length):
        distance_matrix[0][j] = j
    for i in range(1, first_length):
        for j in range(1, second_length):
            deletion = distance_matrix[i-1][j] + 1
            insertion = distance_matrix[i][j-1] + 1
            substitution = distance_matrix[i-1][j-1]
            if first[i-1] != second[j-1]:
                substitution += 1
            distance_matrix[i][j] = min(insertion, deletion, substitution)
    return distance_matrix[first_length-1][second_length-1]
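A quick sanity check of the distance function on the kind of strings wrangler_loc compares:

print levenshtein_distance('schoeneberg', 'schoneberg')   # 1: one missing 'e'
print levenshtein_distance('mitte', 'pankow')              # 6: no characters in common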

In [518]:
df


Out[518]:
date loc price rooms url
id
5029212167 [Mai 17] [' (berlin-spandau)'] [€660] [/ 2br - 48m, - ] http://berlin.craigslist.de/apa/5029212167.html
5029084078 [Mai 17] [' (berlin)'] [€330] [/ 2br - 63m, - ] http://berlin.craigslist.de/apa/5029084078.html
5029084078 [Mai 17] [' (berlin)'] [€330] [/ 2br - 63m, - ] http://berlin.craigslist.de/apa/5029084078.html
5021449984 [Mai 17] [' (berlin)'] [€600] [/ 2br - ] http://berlin.craigslist.de/apa/5021449984.html
5021449984 [Mai 17] [' (berlin)'] [€600] [/ 2br - ] http://berlin.craigslist.de/apa/5021449984.html
5021449984 [Mai 17] [' (berlin)'] [€600] [/ 2br - ] http://berlin.craigslist.de/apa/5021449984.html
5029061931 [Mai 17] [' (tiergarten)'] [€850] [/ 2br - 53m, - ] http://berlin.craigslist.de/apa/5029061931.html
5029061931 [Mai 17] [' (tiergarten)'] [€850] [/ 2br - 53m, - ] http://berlin.craigslist.de/apa/5029061931.html
5029061931 [Mai 17] [' (tiergarten)'] [€850] [/ 2br - 53m, - ] http://berlin.craigslist.de/apa/5029061931.html
5029061931 [Mai 17] [' (tiergarten)'] [€850] [/ 2br - 53m, - ] http://berlin.craigslist.de/apa/5029061931.html
5029050140 [Mai 17] [' (berlin)'] [€60] [/ 1br - 50m, - ] http://berlin.craigslist.de/apa/5029050140.html
5029050140 [Mai 17] [' (berlin)'] [€60] [/ 1br - 50m, - ] http://berlin.craigslist.de/apa/5029050140.html
5029050140 [Mai 17] [' (berlin)'] [€60] [/ 1br - 50m, - ] http://berlin.craigslist.de/apa/5029050140.html
5029050140 [Mai 17] [' (berlin)'] [€60] [/ 1br - 50m, - ] http://berlin.craigslist.de/apa/5029050140.html
5029050140 [Mai 17] [' (berlin)'] [€60] [/ 1br - 50m, - ] http://berlin.craigslist.de/apa/5029050140.html
5012067872 [Mai 10] [' (friedrichshain)'] [€650] [/ 40m, - ] http://berlin.craigslist.de/apa/5012067872.html
5010443169 [Mai 10] [' (pankow)'] [€975] [/ 2br - 68m, - ] http://berlin.craigslist.de/apa/5010443169.html
5010443169 [Mai 10] [' (pankow)'] [€975] [/ 2br - 68m, - ] http://berlin.craigslist.de/apa/5010443169.html
4995332171 [Mai 10] [' (friedrichshain)'] [€975] [/ 1br - 74m, - ] http://berlin.craigslist.de/apa/4995332171.html
4995332171 [Mai 10] [' (friedrichshain)'] [€975] [/ 1br - 74m, - ] http://berlin.craigslist.de/apa/4995332171.html
4995332171 [Mai 10] [' (friedrichshain)'] [€975] [/ 1br - 74m, - ] http://berlin.craigslist.de/apa/4995332171.html
4987715908 [Mai 10] [' (berlin)'] [€450] [/ 1br - ] http://berlin.craigslist.de/apa/4987715908.html
4987715908 [Mai 10] [' (berlin)'] [€450] [/ 1br - ] http://berlin.craigslist.de/apa/4987715908.html
4987715908 [Mai 10] [' (berlin)'] [€450] [/ 1br - ] http://berlin.craigslist.de/apa/4987715908.html
4987715908 [Mai 10] [' (berlin)'] [€450] [/ 1br - ] http://berlin.craigslist.de/apa/4987715908.html
5018233578 [Mai 10] [u' (leipziger stra\xdfe)'] [€700] [/ 2br - 115m, - ] http://berlin.craigslist.de/apa/5018233578.html
5018233578 [Mai 10] [u' (leipziger stra\xdfe)'] [€700] [/ 2br - 115m, - ] http://berlin.craigslist.de/apa/5018233578.html
5018233578 [Mai 10] [u' (leipziger stra\xdfe)'] [€700] [/ 2br - 115m, - ] http://berlin.craigslist.de/apa/5018233578.html
5018233578 [Mai 10] [u' (leipziger stra\xdfe)'] [€700] [/ 2br - 115m, - ] http://berlin.craigslist.de/apa/5018233578.html
5018233578 [Mai 10] [u' (leipziger stra\xdfe)'] [€700] [/ 2br - 115m, - ] http://berlin.craigslist.de/apa/5018233578.html

In [ ]: