In [1]:
import pandas as pd, copy

In [2]:
#UNDATA dbd 1975-2013

#http://data.un.org/Data.aspx?d=UNHCR&f=indID%3AType-Ref
#doanload data in batches, query limited to 50.000
df1=pd.read_csv('1985.csv') #years 1975-1984
df2=pd.read_csv('1995.csv') #years 1985-1994
df3=pd.read_csv('2005.csv') #years 1995-2004
df4=pd.read_csv('2015.csv') #years 2005-2013

In [ ]:
#UNHCR db 2001-2014

#http://popstats.unhcr.org/en/overview#_ga=1.40654370.1278371767.1434418671  - select time series
df=pd.read_csv('all_data.csv',skiprows=3)

In [30]:
#UNHCR db 1951-2014

#http://popstats.unhcr.org/en/overview#_ga=1.40654370.1278371767.1434418671  - person of concern
df=pd.read_csv('all_data2.csv',skiprows=3)

In [31]:
df


Out[31]:
Year Country / territory of asylum/residence Origin / Returned from Refugees Asylum-seekers (pending cases) Returned refugees Internally displaced persons (IDPs) Returned IDPs Stateless persons Others of concern Total Population
0 1951 NaN Various 8800 NaN NaN NaN NaN NaN NaN 8800
1 1951 NaN Various 120000 NaN NaN NaN NaN NaN NaN 120000
2 1951 Australia Various 180000 NaN NaN NaN NaN NaN NaN 180000
3 1951 Austria Various 282000 NaN NaN NaN NaN NaN NaN 282000
4 1951 Belgium Various 55000 NaN NaN NaN NaN NaN NaN 55000
5 1951 Canada Various 168511 NaN NaN NaN NaN NaN NaN 168511
6 1951 Denmark Various 2000 NaN NaN NaN NaN NaN NaN 2000
7 1951 France Various 290000 NaN NaN NaN NaN NaN NaN 290000
8 1951 Germany Various 265000 NaN NaN NaN NaN NaN NaN 265000
9 1951 Greece Various 18000 NaN NaN NaN NaN NaN NaN 18000
10 1951 Hong Kong SAR, China Various 30000 NaN NaN NaN NaN NaN NaN 30000
11 1951 Italy Various 26500 NaN NaN NaN NaN NaN NaN 26500
12 1951 Luxembourg Various 1800 NaN NaN NaN NaN NaN NaN 1800
13 1951 Morocco Various 3000 NaN NaN NaN NaN NaN NaN 3000
14 1951 Netherlands Various 14200 NaN NaN NaN NaN NaN NaN 14200
15 1951 Norway Various 2500 NaN NaN NaN NaN NaN NaN 2500
16 1951 Spain Various 2000 NaN NaN NaN NaN NaN NaN 2000
17 1951 Sweden Various 44000 NaN NaN NaN NaN NaN NaN 44000
18 1951 Switzerland Various 10000 NaN NaN NaN NaN NaN NaN 10000
19 1951 Tunisia Various 2000 NaN NaN NaN NaN NaN NaN 2000
20 1951 Turkey Various 2700 NaN NaN NaN NaN NaN NaN 2700
21 1951 United Kingdom Various 208000 NaN NaN NaN NaN NaN NaN 208000
22 1951 United States Various 350000 NaN NaN NaN NaN NaN NaN 350000
23 1951 Various Various 30000 NaN NaN NaN NaN NaN NaN 30000
24 1952 NaN Various 14000 NaN NaN NaN NaN NaN NaN 14000
25 1952 NaN Various 120000 NaN NaN NaN NaN NaN NaN 120000
26 1952 Austria Various 238200 NaN NaN NaN NaN NaN NaN 238200
27 1952 Belgium Various 53500 NaN NaN NaN NaN NaN NaN 53500
28 1952 Canada Various 154828 NaN NaN NaN NaN NaN NaN 154828
29 1952 Denmark Various 1800 NaN NaN NaN NaN NaN NaN 1800
... ... ... ... ... ... ... ... ... ... ... ...
88272 2014 Hungary Zimbabwe 4 NaN NaN NaN NaN NaN NaN 4
88273 2014 Ireland Zimbabwe 157 NaN NaN NaN NaN NaN NaN 157
88274 2014 Israel Zimbabwe 1 NaN NaN NaN NaN NaN NaN 1
88275 2014 Italy Zimbabwe 27 NaN NaN NaN NaN NaN NaN 27
88276 2014 Kenya Zimbabwe 4 NaN NaN NaN NaN NaN NaN 4
88277 2014 Lesotho Zimbabwe 7 NaN NaN NaN NaN NaN NaN 7
88278 2014 Lithuania Zimbabwe 1 NaN NaN NaN NaN NaN NaN 1
88279 2014 Malaysia Zimbabwe 2 NaN NaN NaN NaN NaN NaN 2
88280 2014 Namibia Zimbabwe 32 NaN NaN NaN NaN NaN NaN 32
88281 2014 Netherlands Zimbabwe 45 NaN NaN NaN NaN NaN NaN 45
88282 2014 New Zealand Zimbabwe 47 NaN NaN NaN NaN NaN NaN 47
88283 2014 Norway Zimbabwe 23 NaN NaN NaN NaN NaN NaN 23
88284 2014 Panama Zimbabwe 2 NaN NaN NaN NaN NaN NaN 2
88285 2014 Poland Zimbabwe 1 NaN NaN NaN NaN NaN NaN 1
88286 2014 Portugal Zimbabwe 1 NaN NaN NaN NaN NaN NaN 1
88287 2014 Rep. of Moldova Zimbabwe 1 NaN NaN NaN NaN NaN NaN 1
88288 2014 Romania Zimbabwe 5 NaN NaN NaN NaN NaN NaN 5
88289 2014 Slovenia Zimbabwe 2 NaN NaN NaN NaN NaN NaN 2
88290 2014 South Africa Zimbabwe 6217 NaN NaN NaN NaN NaN NaN 6217
88291 2014 Spain Zimbabwe 2 NaN NaN NaN NaN NaN NaN 2
88292 2014 Sudan Zimbabwe 1 NaN NaN NaN NaN NaN NaN 1
88293 2014 Swaziland Zimbabwe 7 NaN NaN NaN NaN NaN NaN 7
88294 2014 Sweden Zimbabwe 41 NaN NaN NaN NaN NaN NaN 41
88295 2014 Switzerland Zimbabwe 14 NaN NaN NaN NaN NaN NaN 14
88296 2014 Thailand Zimbabwe 1 NaN NaN NaN NaN NaN NaN 1
88297 2014 Ukraine Zimbabwe 1 NaN NaN NaN NaN NaN NaN 1
88298 2014 United Kingdom Zimbabwe 9467 NaN NaN NaN NaN NaN NaN 9467
88299 2014 United States Zimbabwe 1492 NaN NaN NaN NaN NaN NaN 1492
88300 2014 Uruguay Zimbabwe 1 NaN NaN NaN NaN NaN NaN 1
88301 2014 Zambia Zimbabwe 6 NaN NaN NaN NaN NaN NaN 6

88302 rows × 11 columns


In [32]:
df.columns=['year','target','source','a','b','c','d','e','f','g','value']
df=df.drop(['a','b','c','d','e','f','g',],axis=1).dropna()#.set_index(['year','target','source'])[:80440]
df.head(50)


Out[32]:
year target source value
2 1951 Australia Various 180000
3 1951 Austria Various 282000
4 1951 Belgium Various 55000
5 1951 Canada Various 168511
6 1951 Denmark Various 2000
7 1951 France Various 290000
8 1951 Germany Various 265000
9 1951 Greece Various 18000
10 1951 Hong Kong SAR, China Various 30000
11 1951 Italy Various 26500
12 1951 Luxembourg Various 1800
13 1951 Morocco Various 3000
14 1951 Netherlands Various 14200
15 1951 Norway Various 2500
16 1951 Spain Various 2000
17 1951 Sweden Various 44000
18 1951 Switzerland Various 10000
19 1951 Tunisia Various 2000
20 1951 Turkey Various 2700
21 1951 United Kingdom Various 208000
22 1951 United States Various 350000
23 1951 Various Various 30000
26 1952 Austria Various 238200
27 1952 Belgium Various 53500
28 1952 Canada Various 154828
29 1952 Denmark Various 1800
30 1952 France Various 280000
31 1952 Germany Various 240000
32 1952 Greece Various 19000
33 1952 Italy Various 25500
34 1952 Luxembourg Various 1800
35 1952 Morocco Various 2900
36 1952 Netherlands Various 14100
37 1952 Norway Various 2500
38 1952 Sweden Various 42000
39 1952 Switzerland Various 9800
40 1952 Turkey Various 2000
41 1952 United Kingdom Various 200000
42 1952 United States Various 500000
43 1952 Various Various 31000
46 1953 Austria Various 215200
47 1953 Belgium Various 53000
48 1953 Canada Various 107004
49 1953 Denmark Various 1600
50 1953 France Various 270000
51 1953 Germany Various 228000
52 1953 Greece Various 18500
53 1953 Italy Various 24500
54 1953 Luxembourg Various 1800
55 1953 Morocco Various 2800

In [6]:
df.set_index(['year','target','source']).loc[2014].loc['Pakistan']


Out[6]:
value
source
Afghanistan 1517903
Algeria 17
Ethiopia 4
Ghana 1
Iraq 43
Islamic Rep. of Iran 56
Myanmar 23
Pakistan 1451729
Russian Federation 7
Rwanda 1
Somalia 412
State of Palestine 16
Sudan 2
Syrian Arab Rep. 16
Turkey 7
Uganda 1
Uzbekistan 7

palestinian data, from world bank


In [7]:
import numpy as np
def interpolate(d,years,gfit=2,depth=1,polyorder=1,override=True):
    #depth * length of interpolation substrings will be taken to the left and right
    #for example for {1971:5,1972:6,1973:7,1974:5} interpolating it over 1969-1990
    #for the section 1960-1970 (2 elements) the values from 1972,1973,1974 (3 elements) will be taken with depth 1.5
    #for the section 1974-1990 (15 elements) all values  (4 elements) will be taken to extrapolate
    if (gfit>2): 
        print 'interpolate takes only 1 (polynomial) or 2 (exponential) as 3rd argument [default=2]'
        return
    mydict={}
    missing_points=[[]]
    for year in years:
        if year not in d.keys():
            missing_points[-1].append(year)
        else:
            missing_points.append([])
    for m in missing_points:
        if m:
            fit=gfit
            if ((m[-1]<np.sort(d.keys())[0])|(m[0]>np.sort(d.keys())[-1])): #check if it is ends of the interval, then extrapolate mean only
                if not override: fit=0
            
            if fit==0: #take average
                y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(3)),min(max(years),max(m)+int(3))+1))}
                for i in range(len(m)):
                    mydict[m[i]]=np.mean(y.values())
            elif fit==1:
                #intersector
                y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(depth*len(m))),min(max(years),max(m)+int(depth*len(m)))+1))}
                #print y
                w = np.polyfit(y.keys(),y.values(),polyorder) # obtaining regression parameters
                if (polyorder==1):
                    intersector=w[0]*np.array(m)+w[1]
                else:
                    intersector=w[0]*np.array(m)*np.array(m)+w[1]*np.array(m)+w[2]
                for i in range(len(m)):
                    mydict[m[i]]=max(0,intersector[i])
            else:
                #intersector
                y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(depth*len(m))),min(max(years),max(m)+int(depth*len(m)))+1))}
                #print y
                w = np.polyfit(y.keys(),np.log(y.values()),1) # obtaining log regression parameters (exp fitting)
                intersector=np.exp(w[1])*np.exp(w[0]*np.array(m))
                for i in range(len(m)):
                    mydict[m[i]]=max(0,intersector[i])
    
    #return interpolated points
    return mydict

In [19]:
#http://data.worldbank.org/indicator/SM.POP.REFG/countries?display=default
dz=pd.read_csv('pal.csv',skiprows=2)
dz.columns=['country','cc','a','b']+range(1960,2016)
dz=dz.set_index('country').drop(['cc','a','b']+range(1960,1975)+range(2014,2016),axis=1)

In [20]:
ccc={'Syrian Arab Republic':'Syria','West Bank and Gaza':'Palestine','Jordan':'Jordan','Lebanon':'Lebanon'}
ccr={'Syrian Arab Republic':'Syrian Arab Rep.','West Bank and Gaza':'State of Palestine','Jordan':'Jordan','Lebanon':'Lebanon'}

In [21]:
dx=pd.concat([pd.DataFrame(dz.loc['Jordan']).T,pd.DataFrame(dz.loc['Lebanon']).T\
              ,pd.DataFrame(dz.loc['Syrian Arab Republic']).T,pd.DataFrame(dz.loc['West Bank and Gaza']).T])

In [33]:
#reset df, without setting index to run this parts
palref=['Jordan','Lebanon','Syrian Arab Republic','West Bank and Gaza']
dc=pd.DataFrame(columns=['year','target','source','value'])
    
for k in range(4):
    hp={}
    for i in dx.loc[palref[k]].iteritems():
        if ~np.isnan(i[1]):
            hp[i[0]]=i[1]
    hp.update(interpolate(hp,range(1951,2015)))
    for y in hp:
        val=hp[y]-df[((df['target']==ccr[palref[k]])&(df['year']==y))].sum()[3]
        dc.loc[y+k*1000]=[y,ccc[palref[k]],'Palestine',val]


C:\Anaconda\lib\site-packages\numpy\lib\polynomial.py:588: RankWarning: Polyfit may be poorly conditioned
  warnings.warn(msg, RankWarning)
C:\Anaconda\lib\site-packages\numpy\lib\polynomial.py:588: RankWarning: Polyfit may be poorly conditioned
  warnings.warn(msg, RankWarning)
C:\Anaconda\lib\site-packages\numpy\lib\polynomial.py:588: RankWarning: Polyfit may be poorly conditioned
  warnings.warn(msg, RankWarning)
C:\Anaconda\lib\site-packages\numpy\lib\polynomial.py:588: RankWarning: Polyfit may be poorly conditioned
  warnings.warn(msg, RankWarning)

append plaestine data to main dataframe


In [34]:
df=pd.concat([df,dc])

save data


In [37]:
def cc(country):
    if country in cc2:
        return cc2[country]
    else: return country
cc2={
'Bolivia (Plurinational State of)':'Bolivia',
'Micronesia (Federated States of)':'Micronesia',
'Serbia (and Kosovo: S/RES/1244 (1999))':'Serbia & Kosovo',
'The former Yugoslav Rep. of Macedonia':'FYROM',
'Venezuela (Bolivarian Republic of)':'Venezuela',
'Dem. Rep. of the Congo':'DRC',
'Central African Rep.':'CAR',
"Dem. People's Rep. of Korea":"North Korea",
'Islamic Rep. of Iran':'Iran',
"Lao People's Dem. Rep.":'Lao PDR',
'Papua New Guinea':'PNG',
'Syrian Arab Rep.':'Syria',
'United Rep. of Tanzania':'Tanzania',
'United Arab Emirates':'UAE',
'Antigua and Barbuda':'Antig. & Barb',
'Bosnia and Herzegovina':'Bosnia & Herz.',
'British Virgin Islands':'UK Virgin',
'Brunei Darussalam':'Brunei',
'Hong Kong SAR, China':'Hong Kong',
'Macao SAR, China':'Macao',
'Russian Federation':'Russia',
'Saint Kitts and Nevis':'St. Kitts & Nev.',
'Saint Vincent and the Grenadines':'St. Vinc. & Gren.',
'Sint Maarten (Dutch part)':'St. Maarten',
'State of Palestine':'Palestine',
'Trinidad and Tobago':'Trinid. & Tob.',
'Turks and Caicos Islands':'Turks & Caicos'
}

In [38]:
data={}
countries=set()
for i in df.T.iteritems():
    year=int(i[1][0])
    target=cc(i[1][1])
    source=cc(i[1][2])
    if target=="Various":
        target="Other"
    if source=="Various":
        source="Other"
    value=i[1][3]
    countries.add(target)
    countries.add(source)
    if year not in data:data[year]={}
    if ((source in data[year]) and (target in data[year][source])):
            data[year][source][target][1]=value
    else:
        if target not in data[year]:data[year][target]={}
        if source not in data[year][target]:data[year][target][source]=[0,0]
        data[year][target][source][0]=value

In [39]:
dk=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2'])
c=0
for year in data:
    for target in data[year]:
        for source in data[year][target]:
            dk.loc[c]=[target,source,year,data[year][target][source][0],data[year][target][source][1]]
            c+=1
    print year,'0'


1951 0
1952 0
1953 0
1954 0
1955 0
1956 0
1957 0
1958 0
1959 0
1960 0
1961 0
1962 0
1963 0
1964 0
1965 0
1966 0
1967 0
1968 0
1969 0
1970 0
1971 0
1972 0
1973 0
1974 0
1975 0
1976 0
1977 0
1978 0
1979 0
1980 0
1981 0
1982 0
1983 0
1984 0
1985 0
1986 0
1987 0
1988 0
1989 0
1990 0
1991 0
1992 0
1993 0
1994 0
1995 0
1996 0
1997 0
1998 0
1999 0
2000 0
2001 0
2002 0
2003 0
2004 0
2005 0
2006 0
2007 0
2008 0
2009 0
2010 0
2011 0
2012 0
2013 0
2014 0

In [40]:
dk=dk.set_index(['year','importer1','importer2'])
dk.to_csv('datab.csv')

other direction flow


In [ ]:
data={}
countries=set()
for i in df.T.iteritems():
    year=int(i[1][2])
    target=cc(i[1][0])
    source=cc(i[1][1])
    if target=="Various":
        target="Other"
    if source=="Various":
        source="Other"
    value=i[1][3]
    countries.add(target)
    countries.add(source)
    if year not in data:data[year]={}
    if ((source in data[year]) and (target in data[year][source])):
            data[year][source][target][1]=value
    else:
        if target not in data[year]:data[year][target]={}
        if source not in data[year][target]:data[year][target][source]=[0,0]
        data[year][target][source][0]=value  
dk=pd.DataFrame(columns=['importer2','importer1','year','flow1','flow2'])
c=0
for year in data:
    for target in data[year]:
        for source in data[year][target]:
            dk.loc[c]=[target,source,year,data[year][target][source][0],data[year][target][source][1]]
            c+=1
    print year,'0'
dk=dk.set_index(['year','importer2','importer1'])
dk.to_csv('data.csv')

extra experimentations


In [225]:
data2={}
countries2=set()
for i in df.T.iteritems():
    year=int(i[1][2])
    target=cc(i[1][0])
    source=cc(i[1][1])
    if target=="Various":
        target="Other"
    if source=="Various":
        source="Other"
    value=i[1][3]
    countries2.add(target)
    countries2.add(source)
    if value<10000:
        target="Other"
    if year not in data2:data2[year]={}
    if ((source in data2[year]) and (target in data2[year][source])):
        data2[year][source][target][1]+=value
    else:
        target=cc(i[1][0])
        source=cc(i[1][1])
        if target=="Various":
            target="Other"
        if source=="Various":
            source="Other"
        if value<10000:
            source="Other"
        if target not in data2[year]:data2[year][target]={}
        if source not in data2[year][target]:data2[year][target][source]=[0,0]
        
        data2[year][target][source][0]+=value

In [226]:
dk2=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2'])
c=0
for year in data2:
    for target in data2[year]:
        for source in data2[year][target]:
            dk2.loc[c]=[target,source,year,data2[year][target][source][0],data2[year][target][source][1]]
            c+=1
    print year,'1'


1975 1
1976 1
1977 1
1978 1
1979 1
1980 1
1981 1
1982 1
1983 1
1984 1
1985 1
1986 1
1987 1
1988 1
1989 1
1990 1
1991 1
1992 1
1993 1
1994 1
1995 1
1996 1
1997 1
1998 1
1999 1
2000 1
2001 1
2002 1
2003 1
2004 1
2005 1
2006 1
2007 1
2008 1
2009 1
2010 1
2011 1
2012 1
2013 1

In [227]:
dk2=dk2.set_index(['year','importer1','importer2'])
dk2.to_csv('data2.csv')

In [262]:
data3={}
for i in df.T.iteritems():
    year=int(i[1][2])
    target=cc(i[1][0])
    source=cc(i[1][1])
    if target=="Various":
        target="Other"
    if source=="Various":
        source="Other"
    value=i[1][3]
    if value<0:
        target="Other"
    if year not in data3:data3[year]={}
    if ((source in data3[year]) and (target in data3[year][source])):
        data3[year][source][target][1]+=value
    else:
        target=cc(i[1][0])
        source=cc(i[1][1])
        if target=="Various":
            target="Other"
        if source=="Various":
            source="Other"
        if value<0:
            source="Other"
        if target not in data3[year]:data3[year][target]={}
        if source not in data3[year][target]:data3[year][target][source]=[0,0]
        
        data3[year][target][source][0]+=value

In [327]:
#for th in {1000,5000,10000,50000,100000}:
for th in {10000,100000}:    
    dk3=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2'])
    c=0    
    for year in data3:
        p=0
        dk3.loc[c]=['Other','Other',year,0,0]
        r=copy.deepcopy(c)
        c+=1
        for target in data3[year]:
            for source in data3[year][target]:
                if ((source!='Other') and (target!='Other')):
                    if ((data3[year][target][source][0]>th)or(data3[year][target][source][1]>th)):
                        dk3.loc[c]=[target,source,year,data3[year][target][source][0],data3[year][target][source][1]]
                    else: p+=data3[year][target][source][0]+data3[year][target][source][1]
                else: dk3.loc[r]['flow1']+=data3[year][target][source][0]+data3[year][target][source][1]
                c+=1
        dk3[(dk3['year']==year)&(dk3['importer2']=='Other')&(dk3['importer1']=='Other')]['flow1']+=p
    dk3=dk3.set_index(['year','importer1','importer2'])
    dk3.to_csv(repr(th)+'data3.csv')
    dk3b=dk3.copy()
    dk3b.index.names=[u'year', u'importer2', u'importer1']
    dk3b.to_csv(repr(th)+'data3b.csv')

In [326]:
dk3.loc[1976]


Out[326]:
flow1 flow2
importer1 importer2
Other Other 2472611 0
Ethiopia Sudan 10600 105000
Tanzania Burundi 126600 0
Rwanda 23600 0
DRC Burundi 18220 0
Rwanda 24500 0
Cameroon Equatorial Guinea 30000 0
Algeria Western Sahara 35000 0
Zambia Angola 27730 0
Senegal Guinea-Bissau 37000 0
Thailand Lao PDR 62720 0
Cambodia 17090 0
Gabon Equatorial Guinea 60000 0
Angola DRC 5000 471340
Côte d'Ivoire Guinea 500000 0
Mozambique Zimbabwe 30000 0
Uganda DRC 34230 0
Rwanda 78480 0
Burundi Rwanda 49500 7500