In [1]:
import pandas as pd, copy
In [2]:
#UNDATA dbd 1975-2013
#http://data.un.org/Data.aspx?d=UNHCR&f=indID%3AType-Ref
#doanload data in batches, query limited to 50.000
df1=pd.read_csv('1985.csv') #years 1975-1984
df2=pd.read_csv('1995.csv') #years 1985-1994
df3=pd.read_csv('2005.csv') #years 1995-2004
df4=pd.read_csv('2015.csv') #years 2005-2013
In [ ]:
#UNHCR db 2001-2014
#http://popstats.unhcr.org/en/overview#_ga=1.40654370.1278371767.1434418671 - select time series
df=pd.read_csv('all_data.csv',skiprows=3)
In [30]:
#UNHCR db 1951-2014
#http://popstats.unhcr.org/en/overview#_ga=1.40654370.1278371767.1434418671 - person of concern
df=pd.read_csv('all_data2.csv',skiprows=3)
In [31]:
df
Out[31]:
In [32]:
df.columns=['year','target','source','a','b','c','d','e','f','g','value']
df=df.drop(['a','b','c','d','e','f','g',],axis=1).dropna()#.set_index(['year','target','source'])[:80440]
df.head(50)
Out[32]:
In [6]:
df.set_index(['year','target','source']).loc[2014].loc['Pakistan']
Out[6]:
palestinian data, from world bank
In [7]:
import numpy as np
def interpolate(d,years,gfit=2,depth=1,polyorder=1,override=True):
#depth * length of interpolation substrings will be taken to the left and right
#for example for {1971:5,1972:6,1973:7,1974:5} interpolating it over 1969-1990
#for the section 1960-1970 (2 elements) the values from 1972,1973,1974 (3 elements) will be taken with depth 1.5
#for the section 1974-1990 (15 elements) all values (4 elements) will be taken to extrapolate
if (gfit>2):
print 'interpolate takes only 1 (polynomial) or 2 (exponential) as 3rd argument [default=2]'
return
mydict={}
missing_points=[[]]
for year in years:
if year not in d.keys():
missing_points[-1].append(year)
else:
missing_points.append([])
for m in missing_points:
if m:
fit=gfit
if ((m[-1]<np.sort(d.keys())[0])|(m[0]>np.sort(d.keys())[-1])): #check if it is ends of the interval, then extrapolate mean only
if not override: fit=0
if fit==0: #take average
y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(3)),min(max(years),max(m)+int(3))+1))}
for i in range(len(m)):
mydict[m[i]]=np.mean(y.values())
elif fit==1:
#intersector
y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(depth*len(m))),min(max(years),max(m)+int(depth*len(m)))+1))}
#print y
w = np.polyfit(y.keys(),y.values(),polyorder) # obtaining regression parameters
if (polyorder==1):
intersector=w[0]*np.array(m)+w[1]
else:
intersector=w[0]*np.array(m)*np.array(m)+w[1]*np.array(m)+w[2]
for i in range(len(m)):
mydict[m[i]]=max(0,intersector[i])
else:
#intersector
y = {k: d[k] for k in set(d.keys()).intersection(range(max(min(years),min(m)-int(depth*len(m))),min(max(years),max(m)+int(depth*len(m)))+1))}
#print y
w = np.polyfit(y.keys(),np.log(y.values()),1) # obtaining log regression parameters (exp fitting)
intersector=np.exp(w[1])*np.exp(w[0]*np.array(m))
for i in range(len(m)):
mydict[m[i]]=max(0,intersector[i])
#return interpolated points
return mydict
In [19]:
#http://data.worldbank.org/indicator/SM.POP.REFG/countries?display=default
dz=pd.read_csv('pal.csv',skiprows=2)
dz.columns=['country','cc','a','b']+range(1960,2016)
dz=dz.set_index('country').drop(['cc','a','b']+range(1960,1975)+range(2014,2016),axis=1)
In [20]:
ccc={'Syrian Arab Republic':'Syria','West Bank and Gaza':'Palestine','Jordan':'Jordan','Lebanon':'Lebanon'}
ccr={'Syrian Arab Republic':'Syrian Arab Rep.','West Bank and Gaza':'State of Palestine','Jordan':'Jordan','Lebanon':'Lebanon'}
In [21]:
dx=pd.concat([pd.DataFrame(dz.loc['Jordan']).T,pd.DataFrame(dz.loc['Lebanon']).T\
,pd.DataFrame(dz.loc['Syrian Arab Republic']).T,pd.DataFrame(dz.loc['West Bank and Gaza']).T])
In [33]:
#reset df, without setting index to run this parts
palref=['Jordan','Lebanon','Syrian Arab Republic','West Bank and Gaza']
dc=pd.DataFrame(columns=['year','target','source','value'])
for k in range(4):
hp={}
for i in dx.loc[palref[k]].iteritems():
if ~np.isnan(i[1]):
hp[i[0]]=i[1]
hp.update(interpolate(hp,range(1951,2015)))
for y in hp:
val=hp[y]-df[((df['target']==ccr[palref[k]])&(df['year']==y))].sum()[3]
dc.loc[y+k*1000]=[y,ccc[palref[k]],'Palestine',val]
append plaestine data to main dataframe
In [34]:
df=pd.concat([df,dc])
save data
In [37]:
def cc(country):
if country in cc2:
return cc2[country]
else: return country
cc2={
'Bolivia (Plurinational State of)':'Bolivia',
'Micronesia (Federated States of)':'Micronesia',
'Serbia (and Kosovo: S/RES/1244 (1999))':'Serbia & Kosovo',
'The former Yugoslav Rep. of Macedonia':'FYROM',
'Venezuela (Bolivarian Republic of)':'Venezuela',
'Dem. Rep. of the Congo':'DRC',
'Central African Rep.':'CAR',
"Dem. People's Rep. of Korea":"North Korea",
'Islamic Rep. of Iran':'Iran',
"Lao People's Dem. Rep.":'Lao PDR',
'Papua New Guinea':'PNG',
'Syrian Arab Rep.':'Syria',
'United Rep. of Tanzania':'Tanzania',
'United Arab Emirates':'UAE',
'Antigua and Barbuda':'Antig. & Barb',
'Bosnia and Herzegovina':'Bosnia & Herz.',
'British Virgin Islands':'UK Virgin',
'Brunei Darussalam':'Brunei',
'Hong Kong SAR, China':'Hong Kong',
'Macao SAR, China':'Macao',
'Russian Federation':'Russia',
'Saint Kitts and Nevis':'St. Kitts & Nev.',
'Saint Vincent and the Grenadines':'St. Vinc. & Gren.',
'Sint Maarten (Dutch part)':'St. Maarten',
'State of Palestine':'Palestine',
'Trinidad and Tobago':'Trinid. & Tob.',
'Turks and Caicos Islands':'Turks & Caicos'
}
In [38]:
data={}
countries=set()
for i in df.T.iteritems():
year=int(i[1][0])
target=cc(i[1][1])
source=cc(i[1][2])
if target=="Various":
target="Other"
if source=="Various":
source="Other"
value=i[1][3]
countries.add(target)
countries.add(source)
if year not in data:data[year]={}
if ((source in data[year]) and (target in data[year][source])):
data[year][source][target][1]=value
else:
if target not in data[year]:data[year][target]={}
if source not in data[year][target]:data[year][target][source]=[0,0]
data[year][target][source][0]=value
In [39]:
dk=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2'])
c=0
for year in data:
for target in data[year]:
for source in data[year][target]:
dk.loc[c]=[target,source,year,data[year][target][source][0],data[year][target][source][1]]
c+=1
print year,'0'
In [40]:
dk=dk.set_index(['year','importer1','importer2'])
dk.to_csv('datab.csv')
other direction flow
In [ ]:
data={}
countries=set()
for i in df.T.iteritems():
year=int(i[1][2])
target=cc(i[1][0])
source=cc(i[1][1])
if target=="Various":
target="Other"
if source=="Various":
source="Other"
value=i[1][3]
countries.add(target)
countries.add(source)
if year not in data:data[year]={}
if ((source in data[year]) and (target in data[year][source])):
data[year][source][target][1]=value
else:
if target not in data[year]:data[year][target]={}
if source not in data[year][target]:data[year][target][source]=[0,0]
data[year][target][source][0]=value
dk=pd.DataFrame(columns=['importer2','importer1','year','flow1','flow2'])
c=0
for year in data:
for target in data[year]:
for source in data[year][target]:
dk.loc[c]=[target,source,year,data[year][target][source][0],data[year][target][source][1]]
c+=1
print year,'0'
dk=dk.set_index(['year','importer2','importer1'])
dk.to_csv('data.csv')
extra experimentations
In [225]:
data2={}
countries2=set()
for i in df.T.iteritems():
year=int(i[1][2])
target=cc(i[1][0])
source=cc(i[1][1])
if target=="Various":
target="Other"
if source=="Various":
source="Other"
value=i[1][3]
countries2.add(target)
countries2.add(source)
if value<10000:
target="Other"
if year not in data2:data2[year]={}
if ((source in data2[year]) and (target in data2[year][source])):
data2[year][source][target][1]+=value
else:
target=cc(i[1][0])
source=cc(i[1][1])
if target=="Various":
target="Other"
if source=="Various":
source="Other"
if value<10000:
source="Other"
if target not in data2[year]:data2[year][target]={}
if source not in data2[year][target]:data2[year][target][source]=[0,0]
data2[year][target][source][0]+=value
In [226]:
dk2=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2'])
c=0
for year in data2:
for target in data2[year]:
for source in data2[year][target]:
dk2.loc[c]=[target,source,year,data2[year][target][source][0],data2[year][target][source][1]]
c+=1
print year,'1'
In [227]:
dk2=dk2.set_index(['year','importer1','importer2'])
dk2.to_csv('data2.csv')
In [262]:
data3={}
for i in df.T.iteritems():
year=int(i[1][2])
target=cc(i[1][0])
source=cc(i[1][1])
if target=="Various":
target="Other"
if source=="Various":
source="Other"
value=i[1][3]
if value<0:
target="Other"
if year not in data3:data3[year]={}
if ((source in data3[year]) and (target in data3[year][source])):
data3[year][source][target][1]+=value
else:
target=cc(i[1][0])
source=cc(i[1][1])
if target=="Various":
target="Other"
if source=="Various":
source="Other"
if value<0:
source="Other"
if target not in data3[year]:data3[year][target]={}
if source not in data3[year][target]:data3[year][target][source]=[0,0]
data3[year][target][source][0]+=value
In [327]:
#for th in {1000,5000,10000,50000,100000}:
for th in {10000,100000}:
dk3=pd.DataFrame(columns=['importer1','importer2','year','flow1','flow2'])
c=0
for year in data3:
p=0
dk3.loc[c]=['Other','Other',year,0,0]
r=copy.deepcopy(c)
c+=1
for target in data3[year]:
for source in data3[year][target]:
if ((source!='Other') and (target!='Other')):
if ((data3[year][target][source][0]>th)or(data3[year][target][source][1]>th)):
dk3.loc[c]=[target,source,year,data3[year][target][source][0],data3[year][target][source][1]]
else: p+=data3[year][target][source][0]+data3[year][target][source][1]
else: dk3.loc[r]['flow1']+=data3[year][target][source][0]+data3[year][target][source][1]
c+=1
dk3[(dk3['year']==year)&(dk3['importer2']=='Other')&(dk3['importer1']=='Other')]['flow1']+=p
dk3=dk3.set_index(['year','importer1','importer2'])
dk3.to_csv(repr(th)+'data3.csv')
dk3b=dk3.copy()
dk3b.index.names=[u'year', u'importer2', u'importer1']
dk3b.to_csv(repr(th)+'data3b.csv')
In [326]:
dk3.loc[1976]
Out[326]: