Collect photos from Flickr based on zip code coordinates and clean data


In [1]:
import csv
import datetime
import os
from time import time, mktime

import flickrapi
import pandas as pd
import requests

In [2]:
# Flickr API credentials. They are read from the environment so that real keys
# never have to be typed into (and saved with) the notebook; when a variable is
# unset the value falls back to '' as before.
api_key = os.environ.get('FLICKR_API_KEY', '')
api_secret = os.environ.get('FLICKR_API_SECRET', '')
# Later cells index the search responses like nested dicts, hence 'parsed-json'.
flickr = flickrapi.FlickrAPI(api_key, api_secret, format='parsed-json')

In [3]:
# Read the pickled collection of zip codes that the later cells iterate over.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on a
# file you created or otherwise trust.
import pickle
with open('zipcode_final.txt', 'rb') as zip_file:
    zip_codes = pickle.load(zip_file)

1. Get lat-lng coordinates by zipcode (Google Maps API)


In [39]:
# Google Maps Geocoding API.
# The key is read from the environment instead of being stored in the
# notebook; it falls back to '' when the variable is unset.
GOOGLE_KEY = os.environ.get('GOOGLE_MAPS_API_KEY', '')
GEOCODE_URL = 'https://maps.googleapis.com/maps/api/geocode/json'

# Convert zip codes to coordinates.
# lats2 / lngs2 stay index-aligned with zip_codes: a zip code that cannot be
# geocoded gets the placeholder string 'none' in both lists.
lats2 = []
lngs2 = []
for zip_code in zip_codes:
    # Let requests build and URL-encode the query string.
    r = requests.get(GEOCODE_URL, params={'address': zip_code, 'key': GOOGLE_KEY})
    # An error payload may lack a usable 'results' list; treat a missing list
    # and an empty list the same way instead of raising KeyError.
    results = r.json().get('results') or []
    if results:
        location = results[0]['geometry']['location']
        lat = location['lat']
        lng = location['lng']
    else:
        lat = 'none'
        lng = 'none'
    lats2.append(lat)
    lngs2.append(lng)

1.1 Use this lookup table instead (the Google Maps API is rate-limited)


In [5]:
# Lookup table mapping zip code -> coordinates (columns ZIP, LAT, LNG).
df_ = pd.read_csv('zip_coordinate.csv')
# Preview the first five rows.
df_.head(5)


Out[5]:
ZIP LAT LNG
0 601 18.180555 -66.749961
1 602 18.361945 -67.175597
2 603 18.455183 -67.119887
3 606 18.158345 -66.932911
4 610 18.295366 -67.125135

In [7]:
# Convert zip codes to coordinates using the lookup table.
# Zip codes that are missing from the table are dropped from zip_codes too, so
# that lats, lngs and zip_codes stay index-aligned for the zip(...) loops in
# later cells. Previously a skipped zip code silently shifted every following
# coordinate onto the wrong zip code.
known_zips = set(df_.ZIP.values)
zip_codes = [zc for zc in zip_codes if int(zc) in known_zips]

lats = []
lngs = []
for zip_code in zip_codes:
    temp = df_[df_.ZIP == int(zip_code)]
    # Deliberately appended as pandas Series (not floats): the cleaning cell
    # further down expects Series objects in the lat/lng columns.
    lats.append(temp.LAT)
    lngs.append(temp.LNG)

2. Get the number of "art" photos per year by zip code (within a 1 km radius)


In [158]:
import datetime
from time import time, mktime

# One year is collected per run of this cell; df_all / df_art hold that
# year's rows (one row per zip code).
year = 2011
# Upload-date window from Jan 1 of `year` to Jan 1 of the following year, as
# Unix timestamps (mktime interprets the naive datetimes as local time).
start_date = datetime.datetime(year, 1, 1, 0, 0)
start_stamp = int(mktime(start_date.timetuple()))
end_date = datetime.datetime(year+1, 1, 1, 0, 0)
end_stamp = int(mktime(end_date.timetuple()))

# Collect plain records and build each DataFrame once after the loop, instead
# of re-concatenating an ever-growing frame on every iteration.
all_records = []
art_records = []
for (lat, lng, zip_code) in zip(lats, lngs, zip_codes):
    # Search arguments shared by both queries.
    search_args = dict(sort='relevance', safe_search=1, lat=lat, lon=lng, radius=1,
                       min_upload_date=start_stamp, max_upload_date=end_stamp)
    # all photos
    r_all = flickr.photos_search(**search_args)
    # art photo
    r_art = flickr.photos_search(text=['art'], tags_mode='any', **search_args)

    # Only the reported total is kept from each response.
    all_records.append(dict(total = r_all['photos']['total'],
                            year = year,
                            zip_code = zip_code,
                            lat = lat,
                            lng = lng))
    art_records.append(dict(total = r_art['photos']['total'],
                            year = year,
                            zip_code = zip_code,
                            lat = lat,
                            lng = lng))

df_all = pd.DataFrame(all_records)
df_art = pd.DataFrame(art_records)

In [159]:
# Alias this run's frames under year-specific names (no copy is made).
df_all_2011, df_art_2011 = df_all, df_art

3. Concatenate and clean


In [160]:
# Stack the per-year frames into one frame each, renumbering rows from 0.
# NOTE(review): the *_2012 / *_2013 frames are not created by any visible
# cell -- presumably produced by re-running the collection cell with a
# different `year`; confirm before relying on Restart & Run All.
yearly_all = [df_all_2011, df_all_2012, df_all_2013]
df_alls = pd.concat(yearly_all, ignore_index=True)
yearly_art = [df_art_2011, df_art_2012, df_art_2013]
df_arts = pd.concat(yearly_art, ignore_index=True)

In [167]:
# Preview the first five rows of the combined "all photos" frame.
df_alls.head(5)


Out[167]:
lat lng total year zip_code
0 6560 39.294832 Name: LAT, dtype: float64 6560 -76.622229 Name: LNG, dtype: float64 17430 2011 21201
1 6561 39.296526 Name: LAT, dtype: float64 6561 -76.607016 Name: LNG, dtype: float64 16609 2011 21202
2 6563 39.30229 Name: LAT, dtype: float64 6563 -76.564482 Name: LNG, dtype: float64 139 2011 21205
3 6564 39.338428 Name: LAT, dtype: float64 6564 -76.538877 Name: LNG, dtype: float64 2 2011 21206
4 6565 39.324167 Name: LAT, dtype: float64 6565 -76.719484 Name: LNG, dtype: float64 7 2011 21207

In [171]:
# Each lat/lng cell holds a one-row pandas Series (see the output above);
# extract the scalar into plain float columns.
# .iloc[0] replaces float(x.values): converting a whole array to a float is
# deprecated in recent NumPy and raises outright when the array holds more
# than one element.
for frame in (df_alls, df_arts):
    frame['latitude'] = frame['lat'].apply(lambda x: float(x.iloc[0]))
    frame['longitude'] = frame['lng'].apply(lambda x: float(x.iloc[0]))

In [177]:
# Remove the raw Series-valued coordinate columns now that float copies exist.
df_alls = df_alls.drop(['lat', 'lng'], axis=1)

In [183]:
# Remove the raw Series-valued coordinate columns now that float copies exist.
df_arts = df_arts.drop(['lat', 'lng'], axis=1)

In [180]:
# Preview the cleaned "all photos" frame (first five rows).
df_alls.head(5)


Out[180]:
total year zip_code latitude longitude
0 17430 2011 21201 39.294832 -76.622229
1 16609 2011 21202 39.296526 -76.607016
2 139 2011 21205 39.302290 -76.564482
3 2 2011 21206 39.338428 -76.538877
4 7 2011 21207 39.324167 -76.719484

In [184]:
# Preview the cleaned "art" frame (first five rows).
df_arts.head(5)


Out[184]:
total year zip_code latitude longitude
0 893 2011 21201 39.294832 -76.622229
1 931 2011 21202 39.296526 -76.607016
2 1 2011 21205 39.302290 -76.564482
3 0 2011 21206 39.338428 -76.538877
4 0 2011 21207 39.324167 -76.719484

4. Feature: "art" photo growth rate


In [198]:
# groupby -> elementwise computation
# year: int, zip_code: str
# The keys are passed as a list: current pandas interprets a tuple as ONE
# (tuple-valued) key rather than as a list of column names.
gb = df_arts.groupby(['year', 'zip_code'])

In [217]:
# Spot check: ratio of next year's "art" total to the reference year's total
# for a single zip code.
ref_year = 2011
# Choose the zip code explicitly instead of relying on whatever value an
# earlier loop happened to leave in `zip_code`.
# NOTE(review): zip_codes[0] is assumed to be the zip code behind the saved
# output -- confirm.
sample_zip = zip_codes[0]
# Totals are converted with int() before dividing; +.0 forces float division.
next_total = int(gb.get_group((ref_year+1, sample_zip))['total'].values[0])
ref_total = int(gb.get_group((ref_year, sample_zip))['total'].values[0])
growth = (next_total+.0)/ref_total
print(growth)


1.06830907055

In [253]:
# Year-over-year growth of "art" photo totals per zip code.
# For each ref_year a row stores
#     (total[ref_year+1] - total[ref_year]) / total[ref_year]
# A zero base year cannot be divided by, so it is special-cased:
#     0 -> positive count : growth = 2
#     0 -> 0              : growth = 0
ref_years = [2011, 2012]
# Rows are collected as plain dicts and turned into a DataFrame once, rather
# than concatenating a growing frame on every iteration.
growth_records = []
for ref_year in ref_years:
    for zip_code in zip_codes:
        try:
            prev_year = int(gb.get_group((ref_year,zip_code))['total'].values[0])
            this_year = int(gb.get_group((ref_year+1,zip_code))['total'].values[0])
        except KeyError:
            # No rows for this (year, zip code) pair -- skip it.
            continue
        # `and`, not `&`: `&` binds tighter than `==` and `>`, so the former
        # `prev_year == 0 & this_year > 0` was evaluated as
        # `prev_year == (0 & this_year) > 0`, which is never true -- the
        # growth = 2 case was unreachable.
        if prev_year == 0 and this_year > 0:
            growth = 2
        elif prev_year == 0:
            # Base year is zero and there was no increase.
            growth = 0
        else:
            growth = (this_year - prev_year+.0)/prev_year
        growth_records.append(dict(growth = growth, year = ref_year, zip_code = zip_code))

df_art_growth = pd.DataFrame(growth_records)

In [254]:
# Preview the first five rows of the growth table.
df_art_growth.head(5)


Out[254]:
growth year zip_code
0 0.068309 2011 21201
1 -0.021482 2011 21202
2 0.000000 2011 21205
3 0.000000 2011 21206
4 0.000000 2011 21207

In [255]:
# Write the growth table to disk as UTF-8 CSV, without the row index.
df_art_growth.to_csv('art_rate.csv', index=False, encoding='utf-8')

Other tags: repeat the collection for the text query "artist" (radius = 2)


In [14]:
import datetime
from time import time, mktime
years = range(2011,2014)

# Collect plain records and build the DataFrame once after the loops, instead
# of re-concatenating a growing frame on every iteration.
artist_records = []
for year in years:
    # The upload-date window has to be recomputed for every year. It used to
    # be computed once, before this loop, from a `year` value left over from
    # an earlier cell, so every year was queried with the same window.
    start_date = datetime.datetime(year, 1, 1, 0, 0)
    start_stamp = int(mktime(start_date.timetuple()))
    end_date = datetime.datetime(year+1, 1, 1, 0, 0)
    end_stamp = int(mktime(end_date.timetuple()))

    for (lat, lng, zip_code) in zip(lats, lngs, zip_codes):
        # photos matching the text query "artist" around this coordinate
        r_art = flickr.photos_search(text=['artist'], tags_mode='any',sort = 'relevance', safe_search=1, lat=lat, lon=lng, radius=2, min_upload_date=start_stamp, max_upload_date=end_stamp)
        # Only the reported total is kept from the response.
        artist_records.append(dict(total = r_art['photos']['total'],
                                   year = year,
                                   zip_code = zip_code,
                                   lat = lat,
                                   lng = lng))

df_art = pd.DataFrame(artist_records)

In [11]:
# Write the collected counts to disk as UTF-8 CSV, without the row index.
df_art.to_csv('flickr_hip.csv', index=False, encoding='utf-8')

In [13]:
# Display the collected frame; the saved output below is a truncated view of
# its 1761 rows.
df_art


Out[13]:
lat lng total year zip_code
0 6560 39.294832 Name: LAT, dtype: float64 6560 -76.622229 Name: LNG, dtype: float64 2 2011 21201
1 6561 39.296526 Name: LAT, dtype: float64 6561 -76.607016 Name: LNG, dtype: float64 2 2011 21202
2 6563 39.30229 Name: LAT, dtype: float64 6563 -76.564482 Name: LNG, dtype: float64 0 2011 21205
3 6564 39.338428 Name: LAT, dtype: float64 6564 -76.538877 Name: LNG, dtype: float64 0 2011 21206
4 6565 39.324167 Name: LAT, dtype: float64 6565 -76.719484 Name: LNG, dtype: float64 0 2011 21207
5 6566 39.381174 Name: LAT, dtype: float64 6566 -76.721002 Name: LNG, dtype: float64 0 2011 21208
6 6567 39.373191 Name: LAT, dtype: float64 6567 -76.670003 Name: LNG, dtype: float64 0 2011 21209
7 6568 39.359156 Name: LAT, dtype: float64 6568 -76.632685 Name: LNG, dtype: float64 1 2011 21210
8 6569 39.329817 Name: LAT, dtype: float64 6569 -76.639408 Name: LNG, dtype: float64 0 2011 21211
9 6570 39.368561 Name: LAT, dtype: float64 6570 -76.614898 Name: LNG, dtype: float64 1 2011 21212
10 6571 39.315031 Name: LAT, dtype: float64 6571 -76.577429 Name: LNG, dtype: float64 0 2011 21213
11 6572 39.351793 Name: LAT, dtype: float64 6572 -76.5644 Name: LNG, dtype: float64 0 2011 21214
12 6573 39.345241 Name: LAT, dtype: float64 6573 -76.683566 Name: LNG, dtype: float64 0 2011 21215
13 6574 39.310595 Name: LAT, dtype: float64 6574 -76.671717 Name: LNG, dtype: float64 0 2011 21216
14 6575 39.308473 Name: LAT, dtype: float64 6575 -76.639154 Name: LNG, dtype: float64 1 2011 21217
15 6576 39.330107 Name: LAT, dtype: float64 6576 -76.601451 Name: LNG, dtype: float64 0 2011 21218
16 6580 39.26484 Name: LAT, dtype: float64 6580 -76.492566 Name: LNG, dtype: float64 0 2011 21222
17 6581 39.28283 Name: LAT, dtype: float64 6581 -76.654 Name: LNG, dtype: float64 0 2011 21223
18 6582 39.27486 Name: LAT, dtype: float64 6582 -76.542833 Name: LNG, dtype: float64 0 2011 21224
19 6583 39.226117 Name: LAT, dtype: float64 6583 -76.615735 Name: LNG, dtype: float64 0 2011 21225
20 6584 39.208888 Name: LAT, dtype: float64 6584 -76.562926 Name: LNG, dtype: float64 0 2011 21226
21 6588 39.26613 Name: LAT, dtype: float64 6588 -76.623803 Name: LNG, dtype: float64 3 2011 21230
22 6589 39.2872 Name: LAT, dtype: float64 6589 -76.591953 Name: LNG, dtype: float64 2 2011 21231
23 6590 39.393417 Name: LAT, dtype: float64 6590 -76.534228 Name: LNG, dtype: float64 1 2011 21234
24 6591 39.388421 Name: LAT, dtype: float64 6591 -76.48355 Name: LNG, dtype: float64 0 2011 21236
25 6592 39.341939 Name: LAT, dtype: float64 6592 -76.495443 Name: LNG, dtype: float64 0 2011 21237
26 6593 39.367099 Name: LAT, dtype: float64 6593 -76.589171 Name: LNG, dtype: float64 2 2011 21239
27 6597 39.344707 Name: LAT, dtype: float64 6597 -76.581242 Name: LNG, dtype: float64 0 2011 21251
28 473 42.357768 Name: LAT, dtype: float64 473 -71.064858 Name: LNG, dtype: float64 6 2011 02108
29 474 42.367032 Name: LAT, dtype: float64 474 -71.050493 Name: LNG, dtype: float64 5 2011 02109
... ... ... ... ... ...
1731 6250 38.900126 Name: LAT, dtype: float64 6250 -77.046981 Name: LNG, dtype: float64 11 2013 20052
1732 6251 38.884122 Name: LAT, dtype: float64 6251 -77.011224 Name: LNG, dtype: float64 32 2013 20053
1733 6252 38.90912 Name: LAT, dtype: float64 6252 -77.075735 Name: LNG, dtype: float64 5 2013 20057
1734 6253 38.936354 Name: LAT, dtype: float64 6253 -76.999167 Name: LNG, dtype: float64 4 2013 20064
1735 6301 38.887071 Name: LAT, dtype: float64 6301 -77.02101 Name: LNG, dtype: float64 35 2013 20202
1736 6302 38.885559 Name: LAT, dtype: float64 6302 -77.014429 Name: LNG, dtype: float64 33 2013 20204
1737 6303 38.886412 Name: LAT, dtype: float64 6303 -77.030282 Name: LNG, dtype: float64 33 2013 20228
1738 6304 38.893794 Name: LAT, dtype: float64 6304 -77.032798 Name: LNG, dtype: float64 35 2013 20230
1739 6305 38.894456 Name: LAT, dtype: float64 6305 -77.042605 Name: LNG, dtype: float64 10 2013 20240
1740 6306 38.89334 Name: LAT, dtype: float64 6306 -77.04446 Name: LNG, dtype: float64 11 2013 20245
1741 6307 38.883669 Name: LAT, dtype: float64 6307 -77.025035 Name: LNG, dtype: float64 33 2013 20260
1742 6308 38.975685 Name: LAT, dtype: float64 6308 -77.030133 Name: LNG, dtype: float64 0 2013 20307
1743 6309 38.934841 Name: LAT, dtype: float64 6309 -77.014387 Name: LNG, dtype: float64 4 2013 20317
1744 6310 38.864838 Name: LAT, dtype: float64 6310 -77.017003 Name: LNG, dtype: float64 27 2013 20319
1745 6311 38.858625 Name: LAT, dtype: float64 6311 -77.007865 Name: LNG, dtype: float64 25 2013 20373
1746 6312 38.879019 Name: LAT, dtype: float64 6312 -76.993695 Name: LNG, dtype: float64 27 2013 20390
1747 6313 38.896377 Name: LAT, dtype: float64 6313 -77.042588 Name: LNG, dtype: float64 10 2013 20405
1748 6314 38.8928 Name: LAT, dtype: float64 6314 -77.047764 Name: LNG, dtype: float64 11 2013 20418
1749 6315 38.902057 Name: LAT, dtype: float64 6315 -77.047558 Name: LNG, dtype: float64 13 2013 20427
1750 6316 38.897071 Name: LAT, dtype: float64 6316 -77.038728 Name: LNG, dtype: float64 11 2013 20506
1751 6317 38.89278 Name: LAT, dtype: float64 6317 -77.00689 Name: LNG, dtype: float64 34 2013 20510
1752 6318 38.894759 Name: LAT, dtype: float64 6318 -77.048407 Name: LNG, dtype: float64 11 2013 20520
1753 6319 38.894467 Name: LAT, dtype: float64 6319 -77.024844 Name: LNG, dtype: float64 37 2013 20535
1754 6320 38.887929 Name: LAT, dtype: float64 6320 -77.004713 Name: LNG, dtype: float64 33 2013 20540
1755 6321 38.892802 Name: LAT, dtype: float64 6321 -77.0458 Name: LNG, dtype: float64 11 2013 20551
1756 6322 38.886946 Name: LAT, dtype: float64 6322 -77.022968 Name: LNG, dtype: float64 34 2013 20553
1757 6323 38.888232 Name: LAT, dtype: float64 6323 -77.026003 Name: LNG, dtype: float64 33 2013 20560
1758 6324 38.89063 Name: LAT, dtype: float64 6324 -77.019211 Name: LNG, dtype: float64 36 2013 20565
1759 6325 38.89557 Name: LAT, dtype: float64 6325 -77.055128 Name: LNG, dtype: float64 11 2013 20566
1760 6326 38.866713 Name: LAT, dtype: float64 6326 -77.010187 Name: LNG, dtype: float64 27 2013 20593

1761 rows × 5 columns