Homework 8: Dataset 3: Refugees


In [6]:
# datasource: https://data.humdata.org/dataset/unhcr-refugee-pop-stats/resource/fbacbba3-1b20-4331-931b-6a21a4cb80f5
# dataset is labeled "Group of concern to UNHCR"

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
df = pd.read_csv('refugee_data.csv')

In [3]:
df.head(20)


Out[3]:
Country Origin_Returned_from Population type 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
0 Afghanistan Afghanistan Internally displaced 758625.0 1200000.0 665156.0 184269.0 159549.0 142505.0 129310.0 153718.0 230670.0 297129.0 351907.0 447547.0 486298.0 631286.0
1 Afghanistan Afghanistan Returned IDPs NaN NaN 753344.0 82067.0 27391.0 17044.0 10443.0 8012.0 6453.0 7225.0 3366.0 75453.0 18830.0 21830.0
2 Afghanistan Afghanistan Others of concern NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 838250.0 951167.0 879376.0 275486.0
3 Afghanistan Antigua and Barbuda Returned refugees NaN NaN NaN NaN NaN NaN NaN 4.0 NaN NaN NaN NaN NaN NaN
4 Afghanistan Australia Returned refugees NaN NaN NaN NaN NaN 4.0 NaN 4.0 1.0 1.0 NaN NaN NaN NaN
5 Afghanistan Austria Returned refugees NaN NaN NaN NaN NaN 25.0 22.0 2.0 8.0 NaN NaN NaN NaN NaN
6 Afghanistan Azerbaijan Returned refugees NaN NaN 19.0 25.0 3.0 22.0 1.0 6.0 NaN NaN 10.0 8.0 2.0 17.0
7 Afghanistan Bangladesh Returned refugees NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
8 Afghanistan Belarus Returned refugees NaN NaN NaN 6.0 20.0 24.0 15.0 12.0 NaN 1.0 NaN NaN NaN NaN
9 Afghanistan Belgium Returned refugees NaN NaN NaN NaN NaN 11.0 6.0 6.0 15.0 1.0 NaN NaN NaN NaN
10 Afghanistan Brazil Returned refugees NaN NaN NaN 13.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN
11 Afghanistan Bulgaria Returned refugees NaN NaN NaN 7.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
12 Afghanistan Cambodia Returned refugees NaN NaN NaN 2.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
13 Afghanistan Chile Returned refugees NaN NaN NaN NaN 2.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN
14 Afghanistan China Asylum seekers NaN NaN NaN 4.0 NaN 2.0 NaN NaN NaN NaN NaN NaN NaN NaN
15 Afghanistan Denmark Returned refugees NaN NaN NaN NaN NaN 65.0 5.0 4.0 1.0 NaN NaN NaN NaN NaN
16 Afghanistan Egypt Returned refugees NaN NaN 6.0 4.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN
17 Afghanistan Eritrea Refugees NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN
18 Afghanistan Eritrea Asylum seekers NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN
19 Afghanistan Ethiopia Returned refugees NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN

1) What different population types are of concern for the UNHCR?


In [4]:
df['Population type'].unique()


Out[4]:
array(['Internally displaced', 'Returned IDPs', 'Others of concern',
       'Returned refugees', 'Asylum seekers', 'Refugees', 'Stateless'], dtype=object)

2)-8) For each of these population types, which were the 20 country hosting most in 2013?

except: "Others of concern"

@TAs: This question might actually count as six questions, right? ;)


In [5]:
df.columns


Out[5]:
Index(['Country', 'Origin_Returned_from', 'Population type', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010',
       '2011', '2012', '2013'],
      dtype='object')

In [6]:
recent = df[['Country', 'Origin_Returned_from', 'Population type','2013']]
def population_type_count(a):
    a = recent[recent['Population type'] == a]
    a.groupby('Country')['2013'].sum()
    a_table = pd.DataFrame(a.groupby('Country')['2013'].sum())
    return a_table.sort_values(by='2013', ascending = False).head(20)

TOP20 most refugees


In [7]:
population_type_count('Refugees')


Out[7]:
2013
Country
Pakistan 1616495.0
Islamic Republic of Iran 857352.0
Lebanon 856529.0
Jordan 641894.0
Turkey 609911.0
Kenya 534920.0
Chad 434461.0
Ethiopia 433923.0
China 301045.0
United States 263618.0
Iraq 246294.0
Yemen 241276.0
France 232487.0
Bangladesh 231137.0
Egypt 230055.0
South Sudan 229587.0
Uganda 220538.0
Bolivarian Republic of Venezuela 204291.0
India 188371.0
Germany 187505.0

TOP 20 most asylum seekers


In [8]:
population_type_count('Asylum seekers')


Out[8]:
2013
Country
South Africa 232149.0
Germany 135510.0
United States 84302.0
Turkey 52419.0
Kenya 52270.0
France 51683.0
Greece 49787.0
Malaysia 43005.0
Sweden 27664.0
Uganda 24206.0
Egypt 23105.0
United Kingdom 22990.0
Austria 22681.0
Canada 22110.0
Switzerland 22075.0
Angola 20026.0
Italy 13620.0
Australia 13473.0
Ecuador 12416.0
Belgium 11439.0

TOP20 Internally displaced


In [9]:
population_type_count('Internally displaced')


Out[9]:
2013
Country
Syrian Arab Republic 6520800.0
Colombia 5368138.0
Democratic Republic of the Congo 2963799.0
Sudan 1873300.0
Somalia 1133000.0
Iraq 954128.0
Central African Republic 894421.0
Pakistan 747498.0
Afghanistan 631286.0
Azerbaijan 609029.0
Myanmar 372000.0
South Sudan 331097.0
Yemen 306614.0
Georgia 257611.0
Mali 254822.0
Serbia (and Kosovo: S/RES/1244 (1999)) 227495.0
Philippines 117369.0
Bosnia and Herzegovina 84500.0
Burundi 78948.0
Zimbabwe 60139.0

TOP20 most stateless


In [10]:
population_type_count('Stateless')


Out[10]:
2013
Country
Myanmar 810000.0
Côte d'Ivoire 700000.0
Thailand 506197.0
Latvia 267789.0
Dominican Republic 210000.0
Russian Federation 178000.0
Syrian Arab Republic 160000.0
Iraq 120000.0
Kuwait 93000.0
Estonia 91281.0
Saudi Arabia 70000.0
Malaysia 40000.0
Ukraine 33271.0
Brunei Darussalam 20524.0
Sweden 20450.0
Kenya 20000.0
Germany 11709.0
Kyrgyzstan 11425.0
Viet Nam 11000.0
Poland 10825.0

TOP20 most Returned IDPs


In [11]:
population_type_count('Returned IDPs')


Out[11]:
2013
Country
Democratic Republic of the Congo 595200.0
Philippines 211607.0
Somalia 104706.0
Yemen 93055.0
Pakistan 90637.0
Iraq 63270.0
Mali 42253.0
Sri Lanka 40691.0
Myanmar 27383.0
Afghanistan 21830.0
Côte d'Ivoire 21000.0
Sudan 19471.0
Bosnia and Herzegovina 18949.0
Libya 5350.0
Serbia (and Kosovo: S/RES/1244 (1999)) 780.0
Angola NaN
Burundi NaN
Central African Republic NaN
Chad NaN
Congo NaN

TOP20 most Returned refugees


In [12]:
population_type_count('Returned refugees')


Out[12]:
2013
Country
Syrian Arab Republic 140756.0
Democratic Republic of the Congo 68417.0
Iraq 60880.0
Afghanistan 39665.0
Somalia 36100.0
Côte d'Ivoire 20021.0
Sudan 16949.0
Mali 14278.0
Rwanda 7791.0
Myanmar 3005.0
Burundi 2114.0
Angola 1665.0
Sri Lanka 913.0
Croatia 503.0
South Sudan 392.0
Chad 384.0
Serbia (and Kosovo: S/RES/1244 (1999)) 168.0
Bosnia and Herzegovina 121.0
Togo 108.0
Liberia 83.0

9) Which were the TOP10 countries, most refugees returned home from in 2013?


In [13]:
recent.columns


Out[13]:
Index(['Country', 'Origin_Returned_from', 'Population type', '2013'], dtype='object')

In [14]:
returned_refugees = recent[recent['Population type'] == 'Returned refugees']
returned_refugees.groupby('Origin_Returned_from')['2013'].sum()
returned_table = pd.DataFrame(returned_refugees.groupby('Origin_Returned_from')['2013'].sum())
returned_table.sort_values(by='2013', ascending = False).head(10)


Out[14]:
2013
Origin_Returned_from
Turkey 140881.0
Congo 62935.0
Syrian Arab Republic 45848.0
Pakistan 31224.0
Kenya 28828.0
Liberia 18719.0
Chad 16935.0
Islamic Republic of Iran 12282.0
Democratic Republic of the Congo 8697.0
Niger 5694.0

10) Which country of origin were most asylum seekers from in 2013?


In [15]:
asylum_seekers = recent[recent['Population type'] == 'Asylum seekers']
seekers_table = pd.DataFrame(asylum_seekers.groupby('Origin_Returned_from')['2013'].sum())
seekers_table.sort_values(by='2013', ascending = False).head(10)


Out[15]:
2013
Origin_Returned_from
Various 165057.0
Afghanistan 75273.0
Democratic Republic of the Congo 64008.0
Ethiopia 48590.0
Pakistan 46468.0
Myanmar 45020.0
Zimbabwe 43402.0
Iraq 43109.0
Syrian Arab Republic 40993.0
Somalia 35445.0

11) What is the overall number of (asylum seekers/refugees/idp) for each year?


In [16]:
df.columns


Out[16]:
Index(['Country', 'Origin_Returned_from', 'Population type', '2000', '2001',
       '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010',
       '2011', '2012', '2013'],
      dtype='object')

In [33]:
years_available=['2000', '2001','2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013']
pop_types=['Asylum seekers', 'Refugees', 'Internally displaced']
totals_dict_list=[]

for poptype in pop_types:
    poptype_dictionary ={}
    for year in years_available:
        poptype_only = df[(df['Population type'] == poptype) & (df[year].notnull())]
        poptype_per_year = poptype_only[year].sum()
        print(year, 'there were in total', poptype_per_year, poptype)
        poptype_dictionary[year] = poptype_per_year
    totals_dict_list.append(poptype_dictionary)

totals_dict_list


2000 there were in total 947926.0 Asylum seekers
2001 there were in total 943854.0 Asylum seekers
2002 there were in total 1093755.0 Asylum seekers
2003 there were in total 996428.0 Asylum seekers
2004 there were in total 885249.0 Asylum seekers
2005 there were in total 802174.0 Asylum seekers
2006 there were in total 741291.0 Asylum seekers
2007 there were in total 741110.0 Asylum seekers
2008 there were in total 825043.0 Asylum seekers
2009 there were in total 989169.0 Asylum seekers
2010 there were in total 837445.0 Asylum seekers
2011 there were in total 895284.0 Asylum seekers
2012 there were in total 928226.0 Asylum seekers
2013 there were in total 1168273.0 Asylum seekers
2000 there were in total 12129572.0 Refugees
2001 there were in total 12116835.0 Refugees
2002 there were in total 10594055.0 Refugees
2003 there were in total 9592795.0 Refugees
2004 there were in total 9573397.0 Refugees
2005 there were in total 8661988.0 Refugees
2006 there were in total 9877703.0 Refugees
2007 there were in total 11390930.0 Refugees
2008 there were in total 10489812.0 Refugees
2009 there were in total 10396538.0 Refugees
2010 there were in total 10549679.0 Refugees
2011 there were in total 10404806.0 Refugees
2012 there were in total 10497957.0 Refugees
2013 there were in total 11699638.0 Refugees
2000 there were in total 5998501.0 Internally displaced
2001 there were in total 5096502.0 Internally displaced
2002 there were in total 4646641.0 Internally displaced
2003 there were in total 4181701.0 Internally displaced
2004 there were in total 5426539.0 Internally displaced
2005 there were in total 6616791.0 Internally displaced
2006 there were in total 12794268.0 Internally displaced
2007 there were in total 13740317.0 Internally displaced
2008 there were in total 14442227.0 Internally displaced
2009 there were in total 15628057.0 Internally displaced
2010 there were in total 14697804.0 Internally displaced
2011 there were in total 15473378.0 Internally displaced
2012 there were in total 17670368.0 Internally displaced
2013 there were in total 23925555.0 Internally displaced
Out[33]:
[{'2000': 947926.0,
  '2001': 943854.0,
  '2002': 1093755.0,
  '2003': 996428.0,
  '2004': 885249.0,
  '2005': 802174.0,
  '2006': 741291.0,
  '2007': 741110.0,
  '2008': 825043.0,
  '2009': 989169.0,
  '2010': 837445.0,
  '2011': 895284.0,
  '2012': 928226.0,
  '2013': 1168273.0},
 {'2000': 12129572.0,
  '2001': 12116835.0,
  '2002': 10594055.0,
  '2003': 9592795.0,
  '2004': 9573397.0,
  '2005': 8661988.0,
  '2006': 9877703.0,
  '2007': 11390930.0,
  '2008': 10489812.0,
  '2009': 10396538.0,
  '2010': 10549679.0,
  '2011': 10404806.0,
  '2012': 10497957.0,
  '2013': 11699638.0},
 {'2000': 5998501.0,
  '2001': 5096502.0,
  '2002': 4646641.0,
  '2003': 4181701.0,
  '2004': 5426539.0,
  '2005': 6616791.0,
  '2006': 12794268.0,
  '2007': 13740317.0,
  '2008': 14442227.0,
  '2009': 15628057.0,
  '2010': 14697804.0,
  '2011': 15473378.0,
  '2012': 17670368.0,
  '2013': 23925555.0}]

12) Line graph or stacked bar chart for 11)


In [34]:
asylum_over_time = totals_dict_list[0]
asylums_table = pd.DataFrame(asylum_over_time, index=['Total asylum seekers per year'])
asylums_table
#asylums_table.plot(kind='bar')


Out[34]:
2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
Total asylum seekers per year 947926.0 943854.0 1093755.0 996428.0 885249.0 802174.0 741291.0 741110.0 825043.0 989169.0 837445.0 895284.0 928226.0 1168273.0

In [35]:
refugees_over_time = totals_dict_list[1]
refugees_table = pd.DataFrame(refugees_over_time, index=['Total refugees per year'])
refugees_table


Out[35]:
2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
Total refugees per year 12129572.0 12116835.0 10594055.0 9592795.0 9573397.0 8661988.0 9877703.0 11390930.0 10489812.0 10396538.0 10549679.0 10404806.0 10497957.0 11699638.0

In [36]:
idp_over_time = totals_dict_list[2]
idps_table = pd.DataFrame(idp_over_time, index=['Total IDPs per year'])
idps_table


Out[36]:
2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
Total IDPs per year 5998501.0 5096502.0 4646641.0 4181701.0 5426539.0 6616791.0 12794268.0 13740317.0 14442227.0 15628057.0 14697804.0 15473378.0 17670368.0 23925555.0

In [65]:
#CONCATENATE DID NOT WORK, BUT KEEPING THIS NOTES AS MEMORY

#asylums_table.plot(kind='bar')
#asylum_over_time = totals_dict_list[0]
#asylums_table = pd.DataFrame(asylum_over_time, index=['Total asylum seekers per year'])
#asylums_table
#pd.concat(totals_dict_list[0], axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, copy=True)


#http://stackoverflow.com/questions/31974548/take-a-row-from-one-dataframe-and-insert-into-first-row-of-another-dataframe-in
#A-> dataframe with (v,w,x,y,z) columns ( Some values)
#b -> dataframe with (v,w,x,y,z) columns ( All values)
#b = pd.concat([A[A.v==1],b])

#asylums_table = pd.concat([idps_table[idps_table.v==1],asylums_table])

In [50]:
two_table = asylums_table.append(refugees_table)
two_table


Out[50]:
2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
Total asylum seekers per year 947926.0 943854.0 1093755.0 996428.0 885249.0 802174.0 741291.0 741110.0 825043.0 989169.0 837445.0 895284.0 928226.0 1168273.0
Total refugees per year 12129572.0 12116835.0 10594055.0 9592795.0 9573397.0 8661988.0 9877703.0 11390930.0 10489812.0 10396538.0 10549679.0 10404806.0 10497957.0 11699638.0

In [52]:
totals_table = two_table.append(idps_table)
totals_table
# http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.append.html


Out[52]:
2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
Total asylum seekers per year 947926.0 943854.0 1093755.0 996428.0 885249.0 802174.0 741291.0 741110.0 825043.0 989169.0 837445.0 895284.0 928226.0 1168273.0
Total refugees per year 12129572.0 12116835.0 10594055.0 9592795.0 9573397.0 8661988.0 9877703.0 11390930.0 10489812.0 10396538.0 10549679.0 10404806.0 10497957.0 11699638.0
Total IDPs per year 5998501.0 5096502.0 4646641.0 4181701.0 5426539.0 6616791.0 12794268.0 13740317.0 14442227.0 15628057.0 14697804.0 15473378.0 17670368.0 23925555.0

In [71]:
#totals_table.plot()
plt.style.use("ggplot")
totals_table2 = totals_table.T
totals_table2.plot(figsize=(10,7), ylim=(0,25000000), linewidth=3, y)
#http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.plot


Out[71]:
<matplotlib.axes._subplots.AxesSubplot at 0x110858cc0>

In [ ]: