In [1]:
#use Python 2
import pandas as pd, copy

In [37]:
#UNHCR db 1951-2017

#http://popstats.unhcr.org/en/overview#_ga=1.40654370.1278371767.1434418671  - person of concern
df=pd.read_csv('unhcr_popstats_export_persons_of_concern_all_data.csv',skiprows=3)

In [38]:
df


Out[38]:
Year Country / territory of asylum/residence Origin Refugees (incl. refugee-like situations) Asylum-seekers (pending cases) Returned refugees Internally displaced persons (IDPs) Returned IDPs Stateless persons Others of concern Total Population
0 1951 Australia Various/Unknown 180000 NaN NaN NaN NaN NaN NaN 180000
1 1951 Austria Various/Unknown 282000 NaN NaN NaN NaN NaN NaN 282000
2 1951 Belgium Various/Unknown 55000 NaN NaN NaN NaN NaN NaN 55000
3 1951 Canada Various/Unknown 168511 NaN NaN NaN NaN NaN NaN 168511
4 1951 Switzerland Various/Unknown 10000 NaN NaN NaN NaN NaN NaN 10000
5 1951 Germany Various/Unknown 265000 NaN NaN NaN NaN NaN NaN 265000
6 1951 Denmark Various/Unknown 2000 NaN NaN NaN NaN NaN NaN 2000
7 1951 Spain Various/Unknown 2000 NaN NaN NaN NaN NaN NaN 2000
8 1951 France Various/Unknown 290000 NaN NaN NaN NaN NaN NaN 290000
9 1951 United Kingdom Various/Unknown 208000 NaN NaN NaN NaN NaN NaN 208000
10 1951 Greece Various/Unknown 18000 NaN NaN NaN NaN NaN NaN 18000
11 1951 China, Hong Kong SAR Various/Unknown 30000 NaN NaN NaN NaN NaN NaN 30000
12 1951 Italy Various/Unknown 26500 NaN NaN NaN NaN NaN NaN 26500
13 1951 Luxembourg Various/Unknown 1800 NaN NaN NaN NaN NaN NaN 1800
14 1951 Morocco Various/Unknown 3000 NaN NaN NaN NaN NaN NaN 3000
15 1951 Netherlands Various/Unknown 14200 NaN NaN NaN NaN NaN NaN 14200
16 1951 Norway Various/Unknown 2500 NaN NaN NaN NaN NaN NaN 2500
17 1951 Sweden Various/Unknown 44000 NaN NaN NaN NaN NaN NaN 44000
18 1951 Tunisia Various/Unknown 2000 NaN NaN NaN NaN NaN NaN 2000
19 1951 Turkey Various/Unknown 2700 NaN NaN NaN NaN NaN NaN 2700
20 1951 United States of America Various/Unknown 350000 NaN NaN NaN NaN NaN NaN 350000
21 1952 Austria Various/Unknown 238200 NaN NaN NaN NaN NaN NaN 238200
22 1952 Belgium Various/Unknown 53500 NaN NaN NaN NaN NaN NaN 53500
23 1952 Canada Various/Unknown 154828 NaN NaN NaN NaN NaN NaN 154828
24 1952 Switzerland Various/Unknown 9800 NaN NaN NaN NaN NaN NaN 9800
25 1952 Germany Various/Unknown 240000 NaN NaN NaN NaN NaN NaN 240000
26 1952 Denmark Various/Unknown 1800 NaN NaN NaN NaN NaN NaN 1800
27 1952 France Various/Unknown 280000 NaN NaN NaN NaN NaN NaN 280000
28 1952 United Kingdom Various/Unknown 200000 NaN NaN NaN NaN NaN NaN 200000
29 1952 Greece Various/Unknown 19000 NaN NaN NaN NaN NaN NaN 19000
... ... ... ... ... ... ... ... ... ... ... ...
124439 2017 Zambia Turkey 12 * 0 0.0 0.0 0 0 16
124440 2017 Zambia Uganda 21 0 0 0.0 0.0 0 0 21
124441 2017 Zambia Ukraine 0 * 0 0.0 0.0 0 0 *
124442 2017 Zambia Various/Unknown 71 0 0 0.0 0.0 0 0 71
124443 2017 Zambia South Africa 7 0 0 0.0 0.0 0 0 7
124444 2017 Zambia Zimbabwe 5 0 0 0.0 0.0 0 0 5
124445 2017 Zimbabwe Angola * * 0 0.0 0.0 0 * *
124446 2017 Zimbabwe Burundi 622 17 0 0.0 0.0 0 44 683
124447 2017 Zimbabwe Botswana 0 * 0 0.0 0.0 0 * *
124448 2017 Zimbabwe Central African Rep. 0 0 0 0.0 0.0 0 * *
124449 2017 Zimbabwe Côte d'Ivoire 10 0 0 0.0 0.0 0 * 11
124450 2017 Zimbabwe Dem. Rep. of the Congo 6296 245 0 0.0 0.0 0 2406 8947
124451 2017 Zimbabwe Congo 13 * 0 0.0 0.0 0 0 14
124452 2017 Zimbabwe Egypt 5 0 0 0.0 0.0 0 0 5
124453 2017 Zimbabwe Eritrea 20 * 0 0.0 0.0 0 * 28
124454 2017 Zimbabwe Ethiopia 68 14 0 0.0 0.0 0 * 83
124455 2017 Zimbabwe Ghana 0 0 0 0.0 0.0 0 * *
124456 2017 Zimbabwe Kenya 0 8 0 0.0 0.0 0 * 10
124457 2017 Zimbabwe Lebanon * 0 0 0.0 0.0 0 0 *
124458 2017 Zimbabwe Mali * * 0 0.0 0.0 0 * 5
124459 2017 Zimbabwe Mozambique 0 8059 0 0.0 0.0 0 0 8059
124460 2017 Zimbabwe Rwanda 500 13 0 0.0 0.0 0 239 752
124461 2017 Zimbabwe Sudan 5 9 0 0.0 0.0 0 9 23
124462 2017 Zimbabwe Somalia 19 8 0 0.0 0.0 0 9 36
124463 2017 Zimbabwe Slovenia * 0 0 0.0 0.0 0 0 *
124464 2017 Zimbabwe Syrian Arab Rep. 0 0 0 0.0 0.0 0 * *
124465 2017 Zimbabwe Uganda 8 * 0 0.0 0.0 0 * 13
124466 2017 Zimbabwe South Africa * 9 0 0.0 0.0 0 5 15
124467 2017 Zimbabwe Zambia 0 0 0 0.0 0.0 0 * *
124468 2017 Zimbabwe Zimbabwe 0 0 38 0.0 0.0 0 137 175

124469 rows × 11 columns


In [39]:
df.columns=['year','target','source','a','b','c','d','e','f','g','value']
df=df.drop(['a','b','c','d','e','f','g',],axis=1).dropna()#.set_index(['year','target','source'])[:80440]
df.head(5)


Out[39]:
year target source value
0 1951 Australia Various/Unknown 180000
1 1951 Austria Various/Unknown 282000
2 1951 Belgium Various/Unknown 55000
3 1951 Canada Various/Unknown 168511
4 1951 Switzerland Various/Unknown 10000

In [54]:
df['value']=df['value'].replace('*',0).astype(int)

In [55]:
df.set_index(['year','target','source']).loc[2014].loc['Pakistan']


Out[55]:
value
source
Afghanistan 1523363
Algeria 24
Ethiopia 5
Ghana 1
India 1
Iran (Islamic Rep. of) 66
Iraq 53
Myanmar 23
Nigeria 2
Pakistan 1451729
Palestinian 16
Russian Federation 7
Rwanda 1
Sudan 3
Somalia 446
Syrian Arab Rep. 17
Turkey 7
Uganda 1
Uzbekistan 7

palestinian data, from world bank


In [56]:
import numpy as np
def interpolate(d,years,gfit=2,depth=1,polyorder=1,override=True):
    #depth * length of interpolation substrings will be taken to the left and right
    #for example for {1971:5,1972:6,1973:7,1974:5} interpolating it over 1969-1990
    #for the section 1960-1970 (2 elements) the values from 1972,1973,1974 (3 elements) will be taken with depth 1.5
    #for the section 1974-1990 (15 elements) all values  (4 elements) will be taken to extrapolate
    if (gfit>2): 
        print 'interpolate takes only 1 (polynomial) or 2 (exponential) as 3rd argument [default=2]'
        return
    mydict={}
    missing_points=[[]]
    for year in years:
        if year not in d.keys():
            missing_points[-1].append(year)
        else:
            missing_points.append([])
    for m in missing_points:
        if m:
            fit=gfit
            if ((m[-1]<np.sort(d.keys())[0])|(m[0]>np.sort(d.keys())[-1])): #check if it is ends of the interval, then extrapolate mean only
                if not override: fit=0
            
            if fit==0: #take average
                y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(3)),min(max(years),max(m)+int(3))+1))}
                for i in range(len(m)):
                    mydict[m[i]]=np.mean(y.values())
            elif fit==1:
                #intersector
                y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(depth*len(m))),min(max(years),max(m)+int(depth*len(m)))+1))}
                #print y
                w = np.polyfit(y.keys(),y.values(),polyorder) # obtaining regression parameters
                if (polyorder==1):
                    intersector=w[0]*np.array(m)+w[1]
                else:
                    intersector=w[0]*np.array(m)*np.array(m)+w[1]*np.array(m)+w[2]
                for i in range(len(m)):
                    mydict[m[i]]=max(0,intersector[i])
            else:
                #intersector
                y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(depth*len(m))),min(max(years),max(m)+int(depth*len(m)))+1))}
                #print y
                w = np.polyfit(y.keys(),np.log(y.values()),1) # obtaining log regression parameters (exp fitting)
                intersector=np.exp(w[1])*np.exp(w[0]*np.array(m))
                for i in range(len(m)):
                    mydict[m[i]]=max(0,intersector[i])
    
    #return interpolated points
    return mydict

In [57]:
#http://data.worldbank.org/indicator/SM.POP.REFG/countries?display=default
dz=pd.read_csv('API_SM.POP.REFG_DS2_en_csv_v2_10035903.csv',skiprows=3,header=0)
dz.columns=['country','cc','a','b']+range(1960,2019)
dz=dz.set_index('country').drop(['cc','a','b']+range(1960,1975)+range(2014,2016),axis=1)

In [59]:
ccc={'Syrian Arab Republic':'Syria','West Bank and Gaza':'Palestine','Jordan':'Jordan','Lebanon':'Lebanon'}
ccr={'Syrian Arab Republic':'Syrian Arab Rep.','West Bank and Gaza':'State of Palestine','Jordan':'Jordan','Lebanon':'Lebanon'}

In [60]:
dx=pd.concat([pd.DataFrame(dz.loc['Jordan']).T,pd.DataFrame(dz.loc['Lebanon']).T\
              ,pd.DataFrame(dz.loc['Syrian Arab Republic']).T,pd.DataFrame(dz.loc['West Bank and Gaza']).T])

In [61]:
#reset df, without setting index to run this parts
palref=['Jordan','Lebanon','Syrian Arab Republic','West Bank and Gaza']
dc=pd.DataFrame(columns=['year','target','source','value'])
    
for k in range(4):
    hp={}
    for i in dx.loc[palref[k]].iteritems():
        if ~np.isnan(i[1]):
            hp[i[0]]=i[1]
    hp.update(interpolate(hp,range(1951,2018)))
    for y in hp:
        val=hp[y]-df[((df['target']==ccr[palref[k]])&(df['year']==y))].sum()[3]
        dc.loc[y+k*1000]=[y,ccc[palref[k]],'Palestine',val]


C:\Users\csala\AppData\Local\Continuum\anaconda2\lib\site-packages\ipykernel_launcher.py:42: RankWarning: Polyfit may be poorly conditioned
C:\Users\csala\AppData\Local\Continuum\anaconda2\lib\site-packages\ipykernel_launcher.py:42: RankWarning: Polyfit may be poorly conditioned
C:\Users\csala\AppData\Local\Continuum\anaconda2\lib\site-packages\ipykernel_launcher.py:42: RankWarning: Polyfit may be poorly conditioned
C:\Users\csala\AppData\Local\Continuum\anaconda2\lib\site-packages\ipykernel_launcher.py:42: RankWarning: Polyfit may be poorly conditioned

append plaestine data to main dataframe


In [63]:
df=pd.concat([df,dc])

save data


In [64]:
def cc(country):
    if country in cc2:
        return cc2[country]
    else: return country
cc2={
'Bolivia (Plurinational State of)':'Bolivia',
'Micronesia (Federated States of)':'Micronesia',
'Serbia (and Kosovo: S/RES/1244 (1999))':'Serbia & Kosovo',
'The former Yugoslav Rep. of Macedonia':'FYROM',
'Venezuela (Bolivarian Republic of)':'Venezuela',
'Dem. Rep. of the Congo':'DRC',
'Central African Rep.':'CAR',
"Dem. People's Rep. of Korea":"North Korea",
'Islamic Rep. of Iran':'Iran',
"Lao People's Dem. Rep.":'Lao PDR',
'Papua New Guinea':'PNG',
'Syrian Arab Rep.':'Syria',
'United Rep. of Tanzania':'Tanzania',
'United Arab Emirates':'UAE',
'Antigua and Barbuda':'Antig. & Barb',
'Bosnia and Herzegovina':'Bosnia & Herz.',
'British Virgin Islands':'UK Virgin',
'Brunei Darussalam':'Brunei',
'Hong Kong SAR, China':'Hong Kong',
'Macao SAR, China':'Macao',
'Russian Federation':'Russia',
'Saint Kitts and Nevis':'St. Kitts & Nev.',
'Saint Vincent and the Grenadines':'St. Vinc. & Gren.',
'Sint Maarten (Dutch part)':'St. Maarten',
'State of Palestine':'Palestine',
'Trinidad and Tobago':'Trinid. & Tob.',
'Turks and Caicos Islands':'Turks & Caicos'
}

In [65]:
data={}
countries=set()
for i in df.T.iteritems():
    year=int(i[1][0])
    target=cc(i[1][1])
    source=cc(i[1][2])
    if target=="Various":
        target="Other"
    if source=="Various":
        source="Other"
    value=i[1][3]
    countries.add(target)
    countries.add(source)
    if year not in data:data[year]={}
    if ((source in data[year]) and (target in data[year][source])):
            data[year][source][target][1]=value
    else:
        if target not in data[year]:data[year][target]={}
        if source not in data[year][target]:data[year][target][source]=[0,0]
        data[year][target][source][0]=value

In [66]:
dk=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2'])
c=0
for year in data:
    for target in data[year]:
        for source in data[year][target]:
            dk.loc[c]=[target,source,year,data[year][target][source][0],data[year][target][source][1]]
            c+=1
    print year,'0'


1951 0
1952 0
1953 0
1954 0
1955 0
1956 0
1957 0
1958 0
1959 0
1960 0
1961 0
1962 0
1963 0
1964 0
1965 0
1966 0
1967 0
1968 0
1969 0
1970 0
1971 0
1972 0
1973 0
1974 0
1975 0
1976 0
1977 0
1978 0
1979 0
1980 0
1981 0
1982 0
1983 0
1984 0
1985 0
1986 0
1987 0
1988 0
1989 0
1990 0
1991 0
1992 0
1993 0
1994 0
1995 0
1996 0
1997 0
1998 0
1999 0
2000 0
2001 0
2002 0
2003 0
2004 0
2005 0
2006 0
2007 0
2008 0
2009 0
2010 0
2011 0
2012 0
2013 0
2014 0
2015 0
2016 0
2017 0

In [67]:
dk=dk.set_index(['year','importer1','importer2'])
dk.to_csv('datab.csv')

XXXXXXXXXXX done

optional from here on

other direction flow


In [ ]:


In [68]:
data={}
countries=set()
for i in df.T.iteritems():
    year=int(i[1][2])
    target=cc(i[1][0])
    source=cc(i[1][1])
    if target=="Various":
        target="Other"
    if source=="Various":
        source="Other"
    value=i[1][3]
    countries.add(target)
    countries.add(source)
    if year not in data:data[year]={}
    if ((source in data[year]) and (target in data[year][source])):
            data[year][source][target][1]=value
    else:
        if target not in data[year]:data[year][target]={}
        if source not in data[year][target]:data[year][target][source]=[0,0]
        data[year][target][source][0]=value  
dk=pd.DataFrame(columns=['importer2','importer1','year','flow1','flow2'])
c=0
for year in data:
    for target in data[year]:
        for source in data[year][target]:
            dk.loc[c]=[target,source,year,data[year][target][source][0],data[year][target][source][1]]
            c+=1
    print year,'0'
dk=dk.set_index(['year','importer2','importer1'])
dk.to_csv('data.csv')


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-68-ebd91629ce08> in <module>()
      2 countries=set()
      3 for i in df.T.iteritems():
----> 4     year=int(i[1][2])
      5     target=cc(i[1][0])
      6     source=cc(i[1][1])

ValueError: invalid literal for int() with base 10: 'Various/Unknown'

extra experimentations


In [ ]:
data2={}
countries2=set()
for i in df.T.iteritems():
    year=int(i[1][2])
    target=cc(i[1][0])
    source=cc(i[1][1])
    if target=="Various":
        target="Other"
    if source=="Various":
        source="Other"
    value=i[1][3]
    countries2.add(target)
    countries2.add(source)
    if value<10000:
        target="Other"
    if year not in data2:data2[year]={}
    if ((source in data2[year]) and (target in data2[year][source])):
        data2[year][source][target][1]+=value
    else:
        target=cc(i[1][0])
        source=cc(i[1][1])
        if target=="Various":
            target="Other"
        if source=="Various":
            source="Other"
        if value<10000:
            source="Other"
        if target not in data2[year]:data2[year][target]={}
        if source not in data2[year][target]:data2[year][target][source]=[0,0]
        
        data2[year][target][source][0]+=value

In [ ]:
dk2=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2'])
c=0
for year in data2:
    for target in data2[year]:
        for source in data2[year][target]:
            dk2.loc[c]=[target,source,year,data2[year][target][source][0],data2[year][target][source][1]]
            c+=1
    print year,'1'

In [ ]:
dk2=dk2.set_index(['year','importer1','importer2'])
dk2.to_csv('data2.csv')

In [ ]:
data3={}
for i in df.T.iteritems():
    year=int(i[1][2])
    target=cc(i[1][0])
    source=cc(i[1][1])
    if target=="Various":
        target="Other"
    if source=="Various":
        source="Other"
    value=i[1][3]
    if value<0:
        target="Other"
    if year not in data3:data3[year]={}
    if ((source in data3[year]) and (target in data3[year][source])):
        data3[year][source][target][1]+=value
    else:
        target=cc(i[1][0])
        source=cc(i[1][1])
        if target=="Various":
            target="Other"
        if source=="Various":
            source="Other"
        if value<0:
            source="Other"
        if target not in data3[year]:data3[year][target]={}
        if source not in data3[year][target]:data3[year][target][source]=[0,0]
        
        data3[year][target][source][0]+=value

In [ ]:
#for th in {1000,5000,10000,50000,100000}:
for th in {10000,100000}:    
    dk3=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2'])
    c=0    
    for year in data3:
        p=0
        dk3.loc[c]=['Other','Other',year,0,0]
        r=copy.deepcopy(c)
        c+=1
        for target in data3[year]:
            for source in data3[year][target]:
                if ((source!='Other') and (target!='Other')):
                    if ((data3[year][target][source][0]>th)or(data3[year][target][source][1]>th)):
                        dk3.loc[c]=[target,source,year,data3[year][target][source][0],data3[year][target][source][1]]
                    else: p+=data3[year][target][source][0]+data3[year][target][source][1]
                else: dk3.loc[r]['flow1']+=data3[year][target][source][0]+data3[year][target][source][1]
                c+=1
        dk3[(dk3['year']==year)&(dk3['importer2']=='Other')&(dk3['importer1']=='Other')]['flow1']+=p
    dk3=dk3.set_index(['year','importer1','importer2'])
    dk3.to_csv(repr(th)+'data3.csv')
    dk3b=dk3.copy()
    dk3b.index.names=[u'year', u'importer2', u'importer1']
    dk3b.to_csv(repr(th)+'data3b.csv')

In [ ]:
dk3.loc[1976]

In [ ]: