In [250]:
import os
import re
import time
import urllib2

import lxml
import lxml.html
import numpy as np
import pandas as pd
import requests
#from scrapy.selector import xpath
#from lxml import html
In [501]:
# Search-result pages are addressed through the trailing ``s=`` query value,
# which this notebook fills with multiples of PAGE_SIZE
# (presumably the result offset -- TODO confirm against the site).
base_url = 'http://berlin.craigslist.de/search/apa?s='
PAGE_SIZE = 100  # step between the ``s`` values of two consecutive pages
N_PAGES = 2      # number of result pages to crawl

# zfill(3) only matters for the first page: the value 0 is rendered as "000".
urls = [base_url + str(page * PAGE_SIZE).zfill(3) for page in range(N_PAGES)]
print(urls)
In [534]:
def crawl_urls(urls, output_dir="/Users/arne/projects/Craigslist_BER/", delay=3):
    """Crawl every search-result page in ``urls`` and collect the listings.

    Each URL is handed to ``parse_listing``. The per-page frames are
    concatenated once, indexed by their ``id`` column and written to
    ``<output_dir>/<YYYY-MM-DD>-results.csv``.

    Parameters
    ----------
    urls : iterable of str
        Search-result page URLs to crawl.
    output_dir : str, optional
        Directory that receives the CSV export. The default is the path
        this notebook originally hard-coded.
    delay : int or float, optional
        Seconds to wait between two consecutive requests.

    Returns
    -------
    pandas.DataFrame
        All parsed listings indexed by ``id``. When no page yielded a
        listing an empty frame is returned and nothing is written to disk.
    """
    print("Indexing...")
    frames = []
    for position, url in enumerate(urls):
        if position:
            # Throttle between requests only; waiting after the last page is wasted time.
            time.sleep(delay)
        print("\t>>> " + str(url))
        data = parse_listing(url)
        print(data)
        print("----------------------")
        # parse_listing can come back with None or an empty frame for a page
        # it could not parse; neither contributes rows.
        if data is not None and len(data):
            frames.append(data)

    if not frames:
        # Nothing to index or export; set_index('id') would raise KeyError here.
        all_listings = pd.DataFrame()
        print(all_listings.shape)
        return all_listings

    # One concat at the end instead of re-copying the growing frame per page.
    all_listings = pd.concat(frames)
    print(all_listings.shape)
    all_listings = all_listings.set_index('id')
    filename = os.path.join(output_dir, time.strftime("%Y-%m-%d") + "-results.csv")
    all_listings.to_csv(filename)
    return all_listings
# Crawl all search pages. crawl_urls itself writes the CSV export and returns
# the combined frame, which is kept in df for the cells below.
# parse_listing (defined in the following cell) must already exist in the
# kernel when this line runs.
df = crawl_urls(urls)
In [533]:
def parse_listing(url, limit=5):
    """Download one search-result page and extract its apartment listings.

    Parameters
    ----------
    url : str
        URL of a search-result page.
    limit : int or None, optional
        Maximum number of listings read from the page. The default of 5
        keeps the original "handbrake"; pass ``None`` to read every listing.

    Returns
    -------
    pandas.DataFrame
        One row per listing with the columns ``id``, ``url``, ``date``,
        ``price``, ``rooms`` and ``loc``. The last four hold the raw XPath
        results (lists of text nodes, possibly empty). The frame has no
        rows when the page contains no listing.
    """
    dfcols = ['id', 'url', 'date', 'price', 'rooms', 'loc']

    # Without a timeout a stalled connection would block the whole crawl.
    page = requests.get(url, timeout=30).text
    tree = lxml.html.fromstring(page)

    # A listing is a <p> carrying a data-pid attribute; selecting on the
    # attribute skips unrelated paragraphs instead of failing on them.
    rows = tree.xpath(".//p[@data-pid]")
    if limit is not None:
        rows = rows[:limit]

    listings = []
    for row in rows:
        pid = str(row.attrib['data-pid']).lower()
        # Build a fresh dict per listing (a shared dict would alias every row).
        # The field paths are relative to the listing's own <p>, so an id can
        # never be paired with the fields of a neighbouring listing.
        listings.append({
            'id': pid,
            'url': "http://berlin.craigslist.de/apa/" + pid + ".html",
            'date': row.xpath('./span/span[2]/time/text()'),
            'price': row.xpath('./span/span[3]/span[1]/text()'),
            'rooms': row.xpath('./span/span[3]/span[2]/text()'),
            'loc': row.xpath('./span/span[3]/span[3]/small/text()'),
        })

    # Build the frame once from all records instead of appending row by row.
    return pd.DataFrame(listings, columns=dfcols)
In [469]:
'''
To-do:
1. Implement the value wranglers that are still pass-through stubs
   (wrangler_date, wrangler_rooms).
'''
# Show the raw crawl result (df is assigned by the crawl_urls(urls) call).
df
Out[469]:
In [521]:
def wrangler_date(df):
    """Placeholder for the date clean-up; hands its argument back untouched.

    The notebook maps this function over the entries of the ``date`` column,
    so despite its name ``df`` receives one cell value per call.
    """
    return df
def wrangler_rooms(df):
    """Placeholder for the room-count clean-up; returns its argument as is.

    The notebook maps this function over the entries of the ``rooms`` column,
    so despite its name ``df`` receives one cell value per call.
    """
    return df
def wrangler_price(price):
    """Convert a scraped price into an integer amount.

    Parameters
    ----------
    price : list, tuple, str or number
        The raw XPath result (a list of text nodes such as a euro sign
        followed by digits), a plain string, or the stringified form of
        such a list.

    Returns
    -------
    int or None
        The first number found in ``price`` (a ``,`` or ``.`` followed by
        exactly three digits is treated as a thousands separator), or
        ``None`` when ``price`` contains no digits, e.g. for an empty
        XPath result.
    """
    parts = price if isinstance(price, (list, tuple)) else [price]
    text = u""
    for part in parts:
        if isinstance(part, bytes):
            part = part.decode("utf-8")
        text += u"%s" % (part,)

    # A stringified Python 2 unicode list spells the euro sign as a literal
    # backslash-u escape; remove it so its digits are not taken for the price.
    text = text.replace(u"\\u20ac", u"")

    # Pick the digits with a pattern. str.strip() works on a character set and
    # would also eat leading price digits that happen to be in that set.
    match = re.search(r"\d{1,3}(?:[.,]\d{3})+(?!\d)|\d+", text)
    if match is None:
        return None
    return int(re.sub(r"[.,]", "", match.group(0)))
In [449]:
def wrangler_loc(loc, max_distance=5):
    """Map a scraped location onto the closest known district name.

    The input is stringified, lower-cased, transliterated (umlauts and
    sharp s), stripped of the word "berlin" and of a few punctuation
    characters, and then compared with every known district by
    Levenshtein distance.

    Parameters
    ----------
    loc : object
        Scraped location; it is converted with ``str()`` before matching.
    max_distance : int, optional
        A district is accepted only when its distance to the cleaned
        location is strictly below this value.

    Returns
    -------
    str or None
        The best-matching district (the first one in ``districts`` on a
        tie), or ``None`` when no district is close enough.
    """
    removals = ['/', '-', '(', ')', ':', " "]
    districts = ['schoeneberg', 'mitte', 'friedrichshain', 'prenzlauerberg', 'steglitz', 'pankow', 'wedding']
    transliterations = [('ü', 'ue'), ('ä', 'ae'), ('ö', 'oe'), ('ß', 'ss')]

    # NOTE(review): when loc is a list, str() keeps the repr brackets and
    # quotes; they are not in `removals`, so they count towards every
    # distance. Left as is because max_distance may have been tuned with
    # that noise included -- confirm before unwrapping the list.
    loc = str(loc).lower()
    for special, ascii_form in transliterations:
        loc = loc.replace(special, ascii_form)
    loc = loc.replace('berlin', '')
    for rem in removals:
        loc = loc.replace(rem, '')

    # min() returns the first district with the lowest distance, so a tie
    # yields one clean name rather than a stringified list of names.
    best_district, top_score = min(
        ((dist, levenshtein_distance(dist, loc)) for dist in districts),
        key=lambda pair: pair[1]
    )
    if top_score < max_distance:
        return best_district
    return None
In [525]:
# Clean a copy so the raw crawl result in df stays available for comparison.
df2 = df.copy()
# List comprehensions instead of the built-in map(): map() is lazy on
# Python 3 and would hand pandas an iterator without a length.
df2['loc'] = [wrangler_loc(value) for value in df['loc']]
df2['price'] = [wrangler_price(value) for value in df['price']]
df2['rooms'] = [wrangler_rooms(value) for value in df['rooms']]
df2['date'] = [wrangler_date(value) for value in df['date']]
df2[:50]
Out[525]:
In [519]:
Out[519]:
In [374]:
def levenshtein_distance(first, second):
    """Find the Levenshtein distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.

    Parameters
    ----------
    first, second : str
        The strings to compare; the result is symmetric in both.

    Returns
    -------
    int
        The edit distance (0 for equal strings).
    """
    # Make ``first`` the shorter string so the rolling rows stay small.
    if len(first) > len(second):
        first, second = second, first
    if len(first) == 0:
        return len(second)

    # Only the previous row of the distance table is ever read, so two rows
    # replace the full (len(first)+1) x (len(second)+1) matrix.
    previous = list(range(len(first) + 1))
    for j, second_char in enumerate(second, 1):
        current = [j]
        for i, first_char in enumerate(first, 1):
            cost = 0 if first_char == second_char else 1
            current.append(min(
                previous[i] + 1,         # drop second_char
                current[i - 1] + 1,      # drop first_char
                previous[i - 1] + cost   # keep or substitute
            ))
        previous = current
    return previous[-1]
In [518]:
# Show the raw crawl result again; df2 holds the wrangled copy of it.
df
Out[518]:
In [ ]: