In [ ]:
# Property values by zip code (Zillow and Trulia) and business pattern data (US census)
In [3]:
from random import randint
import pandas as pd
from pandas import DataFrame
import requests
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import folium
import csv
from datetime import datetime # one part of the main module
In [99]:
"""
Zillow does not provide history data: can't use it
"""
# zwsid = '<YOUR_ZWSID>'  # credential redacted: never commit API keys, even in commented-out code
# state = 'WA'
# city = 'Seattle'
# neighborhood = 'Ballard'
# #query_url = 'http://www.zillow.com/webservice/GetDemographics.htm?zws-id=%s&state=%s&city=%s&neighborhood=%s' % (zwsid,state,city,neighborhood)
# query_url = 'http://www.zillow.com/webservice/GetDeepComps.htm?zws-id=%s&zpid=48749425&count=5' % (zwsid)
# r = requests.get(query_url)
In [ ]:
"""
So we're going to use Trulia.
"""
# Trulia API key: intentionally blank here -- fill it in locally and never commit a real key.
trulia_api_key = ''
# Endpoint only; the query string is assembled by requests so that values such as
# "New York" (contains a space) and the API key are percent-encoded correctly.
query_url = 'http://api.trulia.com/webservices.php'
query_params = {
    'library': 'TruliaStats',
    'function': 'getCityStats',
    'city': 'New York',
    'state': 'NY',
    'startDate': '2007-02-10',
    'endDate': '2009-02-07',
    'apikey': trulia_api_key,
}
r = requests.get(query_url, params=query_params)
In [20]:
""" GET SF ZIP CODES from http://www.city-data.com/zipmaps/San-Francisco-California.html
"""
import itertools

# San Francisco ZIP codes, in ascending order (one row per group for readability).
sf_zip_codes = [
    94102, 94103, 94104, 94105, 94107,
    94108, 94109, 94110, 94111, 94112,
    94114, 94115, 94116, 94117, 94118,
    94121, 94122, 94123, 94124, 94127,
    94129, 94131, 94132, 94133, 94134,
    94158,
]
In [364]:
""" Geopy has zip code converter! """
from geopy.geocoders import Nominatim

# Nominatim's usage policy asks every client to identify itself; recent geopy
# releases refuse to build the geocoder without an explicit user_agent.
geolocator = Nominatim(user_agent='zip-code-property-values-notebook')
location = geolocator.geocode("78704")
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print('EXAMPLE:')
print(location.address)
print((location.latitude, location.longitude))
In [369]:
"""
But something is wrong.
"""
# The ZIP code is passed exactly as stored in sf_zip_codes (a bare integer, no
# city or country context) so that this cell keeps demonstrating the problem.
location = geolocator.geocode(sf_zip_codes[0])
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print('EXAMPLE:')
print(location.address)
print((location.latitude, location.longitude))
In [4]:
"""
So we're using Google Geocode API.
"""
# Google Geocoding API key: intentionally blank here -- supply your own before running.
GOOGLE_KEY = ''
# Endpoint only; requests builds and percent-encodes the query string from `params`.
query_url = 'https://maps.googleapis.com/maps/api/geocode/json'
r = requests.get(query_url, params={'address': '94102', 'key': GOOGLE_KEY})
In [375]:
# Display the decoded JSON body of the most recent request stored in `r`.
r.json()
Out[375]:
In [399]:
"""
Get coordinates.
"""
# Decode the response, take the first match and keep its location entry
# (indexed below with 'lat' / 'lng' in the geocoding loop).
temp = r.json()
first_match = temp['results'][0]
temp_ = first_match['geometry']['location']
temp_
Out[399]:
In [22]:
# Geocode every San Francisco ZIP code.
# lats[i] / lngs[i] hold the coordinates of sf_zip_codes[i]; the three lists
# must stay aligned because the map cell indexes them in parallel.
query_url = 'https://maps.googleapis.com/maps/api/geocode/json'
lats = []
lngs = []
for sf_zip_code in sf_zip_codes:
    # Let requests build the query string so the values are escaped properly.
    r = requests.get(query_url, params={'address': str(sf_zip_code), 'key': GOOGLE_KEY})
    r.raise_for_status()  # surface HTTP failures instead of parsing an error page
    temp = r.json()
    if not temp.get('results'):
        # An empty result list (bad key, quota exceeded, unknown address) would
        # otherwise show up as an unexplained IndexError on results[0].
        raise ValueError('Geocoding %s returned no results (status: %s)'
                         % (sf_zip_code, temp.get('status')))
    coordinates = temp['results'][0]['geometry']['location']
    lats.append(coordinates['lat'])
    lngs.append(coordinates['lng'])
In [24]:
import folium

# NOTE(review): circle_marker / create_map look like the legacy folium Map
# methods; newer folium releases use folium.CircleMarker(...).add_to(m) and
# m.save(...) instead -- confirm against the installed folium version.
center_lat, center_lng = 37.7786871, -122.4212424
m = folium.Map(location=[center_lat, center_lng], zoom_start=13)
m.circle_marker(location=[center_lat, center_lng], radius=100)

# One translucent circle per ZIP code, labelled with the code itself.
for idx, zip_code in enumerate(sf_zip_codes):
    m.circle_marker(
        location=[lats[idx], lngs[idx]],
        radius=500,  # 100 seems good enough for now
        popup=str(zip_code),
        line_color="#980043",
        fill_color="#980043",
        fill_opacity=.2)

# Write the finished map to a standalone HTML file.
m.create_map(path='sf_zip_code_map.html')
Check the US Census for the data. It can be downloaded in CSV format.
In [51]:
# business type
# Read the comma-separated 'zbp13detail.txt' file into a DataFrame and preview
# its first rows. Presumably this is the US census business-pattern download
# mentioned above -- verify the file's provenance.
df = pd.read_csv('zbp13detail.txt')
df.head()
Out[51]:
In [37]:
# ZIP codes of interest: San Francisco plus Oakland, concatenated in that order.
sf_zip_codes = [94102, 94103, 94104, 94105, 94107, 94108, 94109, 94110, 94111, 94112, 94114, 94115, 94116, 94117, 94118, 94121, 94122, 94123, 94124, 94127, 94129, 94131, 94132, 94133, 94134, 94158]
oak_zip_codes = [94601, 94602, 94603, 94605, 94606, 94607, 94610, 94611, 94612, 94613, 94621]
bay_zip_codes = sf_zip_codes + oak_zip_codes
# save zipcode file
import csv
import sys

# The csv module wants a binary handle on Python 2 and a text handle opened
# with newline='' on Python 3.
if sys.version_info[0] < 3:
    csv_mode, csv_open_kwargs = 'wb', {}
else:
    csv_mode, csv_open_kwargs = 'w', {'newline': ''}

# The with-block closes (and therefore flushes) the file; the previous bare
# open() left the handle open, so the row was not guaranteed to be on disk
# when the next cell read the file back.
with open('bay_zip_codes.csv', csv_mode, **csv_open_kwargs) as myfile:
    wr = csv.writer(myfile)
    wr.writerow(bay_zip_codes)  # all codes go into a single CSV row
In [48]:
# load zipcode file
import sys

# The csv module wants a binary handle on Python 2 and a text handle opened
# with newline='' on Python 3.
if sys.version_info[0] < 3:
    csv_mode, csv_open_kwargs = 'rb', {}
else:
    csv_mode, csv_open_kwargs = 'r', {'newline': ''}

with open('bay_zip_codes.csv', csv_mode, **csv_open_kwargs) as f:
    reader = csv.reader(f)
    bay_zip_codes = list(reader)[0]  # the file holds all codes in its first row
# convert str list to int list
# (a real list, not map(): on Python 3 map() is a one-shot iterator, which
# cannot be reused and is not accepted by Series.isin)
bay_zip_codes = [int(code) for code in bay_zip_codes]
In [50]:
# Keep only the rows whose 'zip' value is one of the SF / Oakland ZIP codes.
in_bay_area = df['zip'].isin(bay_zip_codes)
df_sf_oak = df.loc[in_bay_area]
In [52]:
# save as a file
# Write the filtered rows as UTF-8 CSV; index=False leaves the DataFrame index
# out of the file.
df_sf_oak.to_csv('ZCBT_sf_oak_2013.csv',encoding='utf-8',index=False)
# sf1.sort(columns='est',ascending=False)
In [55]:
# Preview the last rows of the filtered frame.
df_sf_oak.tail()
Out[55]:
In [431]:
# let's compare to EPA
# (presumably East Palo Alto, ZIP code 94303 -- confirm)
# Read from `df`, the business-pattern frame loaded above: it is the frame that
# has the 'zip' and 'est' columns at this point in the notebook, whereas `b` is
# only assigned further down and holds Trulia listing stats.
epa = df.loc[df['zip'] == 94303]
# DataFrame.sort(columns=...) was removed from pandas; sort_values is its replacement.
epa.sort_values(by='est', ascending=False)
Out[431]:
In [353]:
import trulia.stats as trustat
import trulia.location as truloc

# `import trulia.stats as trustat` binds only the alias, so the bare name
# `trulia` is never defined; reach the class through `trustat`.
# The API key comes from `trulia_api_key`, the only Trulia key variable this
# notebook defines.
zip_code_stats = trustat.TruliaStats(trulia_api_key).get_zip_code_stats(
    zip_code='90025', start_date='2014-01-01', end_date='2014-01-31')
In [354]:
# Take the 'listingStat' entry of the zip-code stats and turn it into a
# DataFrame for inspection.
# NOTE(review): this rebinds `df`, which earlier held the business-pattern data.
temp = zip_code_stats['listingStats']['listingStat']
df = DataFrame(temp)
df.head()
Out[354]:
In [355]:
def func(x, key, subcategory=0):
    """Extract one statistic from a nested listing-price record.

    Parameters
    ----------
    x : mapping
        A record with a 'subcategory' sequence of mappings.
    key : str
        Name of the statistic to read from the chosen subcategory entry,
        e.g. 'medianListingPrice'.
    subcategory : int, optional
        Position of the entry inside x['subcategory'] to read from.
        Defaults to 0, the entry the notebook has always used.

    Returns
    -------
    pandas.Series
        The looked-up value wrapped in a Series, as before.
    """
    k = x['subcategory'][subcategory][key]  # here I read key values
    return pd.Series(k)
In [356]:
# Pull three statistics out of the nested 'listingPrice' records into flat
# columns: (new column name, key inside the listing-price record).
for column_name, stat_key in [('numProperties', 'numberOfProperties'),
                              ('medPrice', 'medianListingPrice'),
                              ('avrPrice', 'averageListingPrice')]:
    df[column_name] = df['listingPrice'].apply(lambda x, stat_key=stat_key: func(x, stat_key))
# The nested column is no longer needed. `axis` is passed by keyword because
# pandas 2.x no longer accepts it positionally.
df = df.drop('listingPrice', axis=1)
df.head()
Out[356]:
In [357]:
"""
Get neighborhoods
"""
# Use the `truloc` alias: `import trulia.location as truloc` does not bind the
# bare name `trulia`. The key comes from `trulia_api_key`, the only Trulia key
# variable this notebook defines.
neighborhoods = truloc.LocationInfo(trulia_api_key).get_neighborhoods_in_city('San Francisco', 'CA')
In [358]:
# Display the complete neighborhood result.
# NOTE(review): this may be long -- consider showing only a slice.
neighborhoods
Out[358]:
In [359]:
""" Trulia does not provide coordinates."""
# First entry of the neighborhood result; the variable name suggests it is
# Alamo Square -- verify against the displayed value.
Alamo_Square = neighborhoods[0]
Alamo_Square
Out[359]:
In [ ]:
# The key comes from `trulia_api_key`, the only Trulia key variable this
# notebook defines (TRULIA_KEY is never assigned anywhere).
# NOTE(review): 7183 is presumably the id of one of the neighborhoods listed
# above -- confirm which one.
neighborhood_stats = trustat.TruliaStats(trulia_api_key).get_neighborhood_stats(
    neighborhood_id=7183, start_date='2012-01-01', end_date='2012-06-30')
In [229]:
# Inspect the top-level keys of the neighborhood stats response.
neighborhood_stats.keys()
In [230]:
# Inspect the keys nested under 'listingStats'.
neighborhood_stats['listingStats'].keys()
Out[230]:
In [236]:
# Take the 'listingStat' entry of the neighborhood stats and turn it into a
# DataFrame (`b`) for the cells below.
a = neighborhood_stats['listingStats']['listingStat']
b = DataFrame(a)
b.head()
Out[236]:
In [221]:
# Let's focus on All properties
# Look at the first row's 'listingPrice' record and its first 'subcategory'
# entry (presumably the "All properties" aggregate -- the next cell shows its
# 'type').
x = b['listingPrice'][0]
x['subcategory'][0]
Out[221]:
In [222]:
# Show the 'type' label of that first subcategory entry.
x['subcategory'][0]['type']
Out[222]:
In [237]:
# Flatten the nested 'listingPrice' records of `b` into plain columns.
# Each pair is (new column name, key looked up by func).
price_fields = (
    ('numProperties', 'numberOfProperties'),
    ('medPrice', 'medianListingPrice'),
    ('avrPrice', 'averageListingPrice'),
)
for column_name, stat_key in price_fields:
    b[column_name] = b['listingPrice'].apply(
        lambda record, stat_key=stat_key: func(record, stat_key))
In [238]:
# Display `b` without the nested column; the result is not assigned, so `b`
# itself keeps 'listingPrice'. `axis` is passed by keyword because pandas 2.x
# no longer accepts it positionally.
b.drop('listingPrice', axis=1)
Out[238]:
In [259]:
# NOTE(review): `a` is assigned above from
# neighborhood_stats['listingStats']['listingStat'], which does not look like
# date input for date2num; the commented-out line in the next cell suggests `a`
# once held a parsed date. Confirm the intent or drop this cell.
matplotlib.dates.date2num(a)
Out[259]:
In [262]:
# Parse every 'YYYY-MM-DD' string in the weekEndingDate column into a datetime.
date_list = [datetime.strptime(week_end, '%Y-%m-%d') for week_end in b['weekEndingDate']]
#a = datetime.strptime(b['weekEndingDate'],'%Y-%m-%d')
In [271]:
# plot time vs. value
# Convert the parsed datetimes to matplotlib date numbers, then draw the
# 'medPrice' column (filled from 'medianListingPrice' above) as a solid line
# ('-') against those dates.
dates = matplotlib.dates.date2num(date_list)
fig, ax = plt.subplots()
ax.plot_date(dates, b.medPrice,'-')
Out[271]:
In [ ]: