dataapi



In [32]:
# Use case for NLGIS2 data analysis using data service and pandas library
# (C) Vyacheslav Tykhonov vty@iisg.nl
# International Institute of Social History 
# http://socialhistory.org

%matplotlib inline
import urllib2 
import simplejson
import json
import sys
import pandas as pd
import random
import vincent
from vincent import Axis, AxisProperties, PropertySet, ValueRef
from pandas.io.json import json_normalize

# Global settings
apiurl = "http://node-128.dev.socialhistoryservices.org/api/data"
amscodecolumn = 'amsterdam_code'
yearcolumn = 'year'

# Default values
varcode = "TXVV"
varyear = "1982"
colors = ['red', 'green', 'orange', 'brown', 'purple', 'blue', 'cyan']

def load_api_data(apiurl, code, year):
    amscode = str(code)
    jsondataurl = apiurl + "?code=" + str(code) + '&year=' + year
    
    req = urllib2.Request(jsondataurl)
    opener = urllib2.build_opener()
    f = opener.open(req)
    dataframe = simplejson.load(f)
    return dataframe

def data2frame(dataframe):
    data = dataframe['data']
    years = {}
    debug = 0
    datavalues = {}
        
    for item in data:
        amscode = item[amscodecolumn]
        year = item[yearcolumn]
        datavalues[year] = item
        if debug:
            print str(amscode) + ' ' + str(year)
            print item
    
    for year in datavalues:        
        values = datavalues[year]
        for name in values:
            if debug:
                print name + ' ' + str(values[name])
    return datavalues
    
data = load_api_data(apiurl, varcode, varyear)
# 'indicator': 'TK', 'code': 'TXCU', 'naam': 'ADORP', 'amsterdam_code': '10996', 'value': 89.0, 'year': 1937, 'id': 1, 'cbsnr': '1'
# Create DataFrame object pf and load data 
yeardf, amscodedf, naamdf, valuedf = [],[],[],[]
for amscode in data:
    values = data[amscode]
    yeardf.append(data[amscode]['year'])
    naamdf.append(data[amscode]['naam'])
    amscodedf.append(data[amscode]['amsterdam_code'])
    valuedf.append(data[amscode]['value'])
    
df = pd.DataFrame([yeardf,amscodedf,naamdf,valuedf]).T
df.columns = ['year', 'amsterdam_code', 'naam', 'value']
    #data
#json_normalize(data[0])
#dataframe = json.loads(data)
#df.read_json(data)    

# Exploring dataset to see columns and data
print df.head()
# Copy dataframe to new variable
newframe = df[['year', 'amsterdam_code', 'naam', 'value']]


   year amsterdam_code        naam value
0  1982          10298    OLDEHOVE   273
1  1982          10297   HAGESTEIN   194
2  1982          10296  NOORDELOOS   214
3  1982          10291      HEERDE  1828
4  1982          10290     HETEREN   955

[5 rows x 4 columns]

Now let's calculate total values for each city and show first 20 locations


In [27]:
newframe = df[['amsterdam_code', 'value']][:20]
print newframe


   amsterdam_code value
0           10298   273
1           10297   194
2           10296   214
3           10291  1828
4           10290   955
5           10292   476
6           10453  2190
7           10451   264
8           10450   117
9           10457  1163
10          10456   392
11          10455   358
12          10454  1372
13          10512  2993
14          10104  1659
15          10511  3233
16          10101   839
17          10517  8790
18          10103  2600
19          10515  1090

[20 rows x 2 columns]

We need some basic color maps to calculate from out data


In [28]:
def colormapslimits(dataframe):
    scale = []
    frame1 = []
    frame2 = []
    avg = values.median()
    for value in dataframe:
        if value <= avg:
            frame1.append(value)
        else:
            frame2.append(value)
    avg1 = pd.DataFrame(frame1).median()
    avg2 = pd.DataFrame(frame2).median()
            
    return (values.min(), int(avg1), int(avg), int(avg2), values.max())

In [29]:
values = newframe['value'][:20]
dfnames = df['naam'][:20]
codes = [df['amsterdam_code'][:20]]
list_data = []
names = []

for value in values:
    list_data.append(value)
for name in dfnames:
    names.append(name)

# New dataframe to make chart
thisDF = pd.DataFrame(list_data, names)

colormap = colormapslimits(values)
print names
print list_data
print 'Limits to build color map: ' + str(colormap)

bar = vincent.Bar(thisDF)
bar.axes[0].properties = AxisProperties (
    labels=PropertySet(
    angle=ValueRef(value=45),
    align=ValueRef(value='left')
    )
   )
vincent.core.initialize_notebook()

bar.axis_titles(x='', y='Value')
bar.display()


['OLDEHOVE', 'HAGESTEIN', 'NOORDELOOS', 'HEERDE', 'HETEREN', 'BELFELD', 'WINSCHOTEN', 'BLESKENSGRAAF EN HOFWEGEN', 'GOUDRIAAN', 'SUSTEREN', 'POSTERHOLT', 'RIETHOVEN', 'WONSERADEEL', 'NUNSPEET', 'NUTH', 'SCHAGEN', 'GEERTRUIDENBERG', 'ALPHEN AAN DEN RIJN ZH', 'OISTERWIJK', 'SIMPELVELD']
[273.0, 194.0, 214.0, 1828.0, 955.0, 476.0, 2190.0, 264.0, 117.0, 1163.0, 392.0, 358.0, 1372.0, 2993.0, 1659.0, 3233.0, 839.0, 8790.0, 2600.0, 1090.0]
Limits to build color map: (117.0, 315, 1022, 2009, 8790.0)

The same data on Pie chart


In [30]:
pie = vincent.Pie(thisDF)
pie.colors(brew='Set3')
pie.legend('Locations')


Out[30]:

Create final dataset with amsterdam codes, values and colors


In [31]:
print "Colors for visualization of locations on map"
for value in list_data:
    # 42.0, 75, 221, 321, 2331.0
    for i in range(len(colormap)):
        min, max = colormap[i-1], colormap[i]

        if value >= min:
            if value < max:
                print str(value) + ' ' + colors[i]


Colors for visualization of locations on map
273.0 green
194.0 green
214.0 green
1828.0 brown
955.0 orange
476.0 orange
2190.0 purple
264.0 green
117.0 green
1163.0 brown
392.0 orange
358.0 orange
1372.0 brown
2993.0 purple
1659.0 brown
3233.0 purple
839.0 orange
2600.0 purple
1090.0 brown

To do: visualize dataset on map of Netherlands