Scraping Population from Wikipedia


In [1]:
# http://lxml.de/lxmlhtml.html
import requests
from lxml.html import fromstring, parse
from itertools import islice

# http://stackoverflow.com/a/1779324/7782
import locale
locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' ) 

url = "https://en.wikipedia.org/w/index.php?title=List_of_countries_by_population_(United_Nations)&oldid=590438477"
page = requests.get(url).content.decode("UTF-8")

doc = fromstring(page)

def parse_rank(col):
    try:
        rank = int(col.text)
        return rank
    except:
        return None

def parse_name(col):
    try:
        # find all the anchors and if href points is the form "/wiki"
        name = "; ".join([a.text for a in col.findall(".//a") if a.attrib["href"].startswith("/wiki/")])
        return name
    except:
        return None

def parse_pop(col):
    return locale.atoi(col.text)


def country_by_pop():
    
    for row in islice(doc.xpath("""//*[@id="mw-content-text"]/table[1]/tr"""),2, None):
        cols = row.findall(".//td")
        yield (parse_rank(cols[0]), parse_name(cols[1]), parse_pop(cols[2]))
    
for (i, row) in enumerate(islice(country_by_pop(), None)):
    print i, 
    for col in row:
        if type(col) == 'unicode':
            print col.encode("UTF-8"), 
        else:
            print col, 
    print


0 1 China 1385566537
1 2 India 1252139596
2 3 United States 320050716
3 4 Indonesia 249865631
4 5 Brazil 200361925
5 6 Pakistan 182142594
6 7 Nigeria 173615345
7 8 Bangladesh 156594962
8 9 Russia 142833689
9 10 Japan 127143577
10 11 Mexico 122332399
11 12 Philippines 98393574
12 13 Ethiopia 94100756
13 14 Vietnam 91679733
14 15 Germany 82726626
15 16 Egypt 82056378
16 17 Iran 77447168
17 18 Turkey 74932641
18 19 Congo, Democratic Republic of the 67513677
19 20 Thailand 67010502
20 21 France 64291280
21 22 United Kingdom 63136265
22 23 Italy 60990277
23 24 Myanmar 53259018
24 25 South Africa 52776130
25 26 Korea, South 49262698
26 27 Tanzania 49253126
27 28 Colombia 48321405
28 29 Spain 46926963
29 30 Ukraine 45238805
30 31 Kenya 44353691
31 32 Argentina 41446246
32 33 Algeria 39208194
33 34 Poland 38216635
34 35 Sudan 37964306
35 36 Uganda 37578876
36 37 Canada 35181704
37 38 Iraq 33765232
38 39 Morocco 33008150
39 40 Afghanistan 30551674
40 41 Venezuela 30405207
41 42 Peru 30375603
42 43 Malaysia 29716965
43 44 Uzbekistan 28934102
44 45 Saudi Arabia 28828870
45 46 Nepal 27797457
46 47 Ghana 25904598
47 48 Mozambique 25833752
48 49 Korea, North 24895480
49 50 Yemen 24407381
50 51 Australia 23342553
51 52 Taiwan 23329772
52 53 Madagascar 22924851
53 54 Cameroon 22253959
54 55 Syria 21898061
55 56 Romania 21698585
56 57 Angola 21471618
57 58 Sri Lanka 21273228
58 59 Côte d'Ivoire 20316086
59 60 Niger 17831270
60 61 Chile 17619708
61 62 Burkina Faso 16934839
62 63 Netherlands 16759229
63 64 Kazakhstan 16440586
64 65 Malawi 16362567
65 66 Ecuador 15737878
66 67 Guatemala 15468203
67 68 Mali 15301650
68 69 Cambodia 15135169
69 70 Zambia 14538640
70 71 Zimbabwe 14149648
71 72 Senegal 14133280
72 73 Chad 12825314
73 74 Rwanda 11776522
74 75 Guinea 11745189
75 76 South Sudan 11296173
76 77 Cuba 11265629
77 78 Greece 11127990
78 79 Belgium 11104476
79 80 Tunisia 10996515
80 81 Czech Republic 10702197
81 82 Bolivia 10671200
82 83 Portugal 10608156
83 84 Somalia 10495583
84 85 Dominican Republic 10403761
85 86 Benin 10323474
86 87 Haiti 10317461
87 88 Burundi 10162532
88 89 Hungary 9954941
89 90 Sweden 9571105
90 91 Serbia; Kosovo 9510506
91 92 Azerbaijan 9413420
92 93 Belarus 9356678
93 94 United Arab Emirates 9346129
94 95 Austria 8495145
95 96 Tajikistan 8207834
96 97 Honduras 8097688
97 98 Switzerland 8077833
98 99 Israel 7733144
99 100 Papua New Guinea 7321262
100 101 Jordan 7273799
101 102 Bulgaria 7222943
102 None Hong Kong 7203836
103 103 Togo 6816982
104 104 Paraguay 6802295
105 105 Laos 6769727
106 106 El Salvador 6340454
107 107 Eritrea 6333135
108 108 Libya 6201521
109 109 Sierra Leone 6092075
110 110 Nicaragua 6080478
111 111 Denmark 5619096
112 112 Kyrgyzstan 5547548
113 113 Slovakia 5450223
114 114 Finland 5426323
115 115 Singapore 5411737
116 116 Turkmenistan 5240072
117 117 Norway 5042671
118 118 Costa Rica 4872166
119 119 Lebanon 4821971
120 120 Ireland 4627173
121 121 Central African Republic 4616417
122 122 New Zealand 4505761
123 123 Congo, Republic of the 4447632
124 124 Georgia 4340895
125 125 Palestine 4326295
126 126 Liberia 4294077
127 127 Croatia 4289714
128 128 Mauritania 3889880
129 129 Panama 3864170
130 130 Bosnia and Herzegovina 3829307
131 None Puerto Rico 3688318
132 131 Oman 3632444
133 132 Moldova 3487204
134 133 Uruguay 3407062
135 134 Kuwait 3368572
136 135 Albania 3173271
137 136 Lithuania 3016933
138 137 Armenia 2976566
139 138 Mongolia 2839073
140 139 Jamaica 2783888
141 140 Namibia 2303315
142 141 Qatar 2168673
143 142 Macedonia 2107158
144 143 Lesotho 2074465
145 144 Slovenia 2071997
146 145 Latvia 2050317
147 146 Botswana 2021144
148 147 Gambia 1849285
149 148 Guinea-Bissau 1704255
150 149 Gabon 1671711
151 150 Trinidad and Tobago 1341151
152 151 Bahrain 1332171
153 152 Estonia 1287251
154 153 Swaziland 1249514
155 154 Mauritius 1244403
156 155 Cyprus 1141166
157 156 Timor-Leste 1132879
158 157 Fiji 881065
159 None Réunion 875375
160 158 Djibouti 872932
161 159 Guyana 799613
162 160 Equatorial Guinea 757014
163 161 Bhutan 753947
164 162 Comoros 734917
165 163 Montenegro 621383
166 None Western Sahara 567315
167 None Macau 566375
168 164 Solomon Islands 561231
169 165 Suriname 539276
170 166 Luxembourg 530380
171 167 Cape Verde 498897
172 None Guadeloupe 465800
173 168 Malta 429004
174 169 Brunei 417784
175 None Martinique 403682
176 170 Bahamas 377374
177 171 Maldives 345023
178 172 Belize 331900
179 173 Iceland 329535
180 174 Barbados 284644
181 None French Polynesia 276831
182 None New Caledonia 256496
183 175 Vanuatu 252763
184 None French Guiana 249227
185 None Mayotte 222152
186 176 São Tomé and Príncipe 192993
187 177 Samoa 190372
188 178 Saint Lucia 182273
189 None Guam 165124
190 None Guernsey; Jersey 162018
191 None Curaçao 158760
192 179 Saint Vincent and the Grenadines 109373
193 None Virgin Islands, United States 106627
194 180 Grenada 105897
195 181 Tonga 105323
196 182 Micronesia, Federated States of 103549
197 None Aruba 102911
198 183 Kiribati 102351
199 184 Seychelles 92838
200 185 Antigua and Barbuda 89985
201 None Isle of Man 85888
202 186 Andorra 79218
203 187 Dominica 72003
204 None Bermuda 65341
205 None Cayman Islands 58435
206 None Greenland 56987
207 None American Samoa 55165
208 188 Saint Kitts and Nevis 54191
209 None Northern Mariana Islands 53855
210 189 Marshall Islands 52634
211 None Faroe Islands 49469
212 None Sint Maarten 45233
213 190 Monaco 37831
214 191 Liechtenstein 36925
215 None Turks and Caicos Islands 33098
216 192 San Marino 31448
217 None Gibraltar 29310
218 None Virgin Islands, British 28341
219 193 Palau 20918
220 None Cook Islands 20629
221 None Caribbean Netherlands 19130
222 None Anguilla 14300
223 None Wallis and Futuna 13272
224 194 Nauru 10051
225 195 Tuvalu 9876
226 None Saint Pierre and Miquelon 6043
227 None Montserrat 5091
228 None Saint Helena, Ascension and Tristan da Cunha 4129
229 None Falkland Islands 3044
230 None Niue 1344
231 None Tokelau 1195
232 196 Vatican City 799

In [2]:
import json
s = json.dumps([row for row in country_by_pop()], ensure_ascii=True)

In [ ]:
type(s)

In [ ]:
print s

In [3]:
# https://gist.github.com/rdhyee/8511607/raw/f16257434352916574473e63612fcea55a0c1b1c/population_of_countries.json

# read population in
import json
import requests

pop_json_url = "https://gist.github.com/rdhyee/8511607/raw/f16257434352916574473e63612fcea55a0c1b1c/population_of_countries.json"
pop_list= requests.get(pop_json_url).json()
pop_list


Out[3]:
[[1, u'China', 1385566537],
 [2, u'India', 1252139596],
 [3, u'United States', 320050716],
 [4, u'Indonesia', 249865631],
 [5, u'Brazil', 200361925],
 [6, u'Pakistan', 182142594],
 [7, u'Nigeria', 173615345],
 [8, u'Bangladesh', 156594962],
 [9, u'Russia', 142833689],
 [10, u'Japan', 127143577],
 [11, u'Mexico', 122332399],
 [12, u'Philippines', 98393574],
 [13, u'Ethiopia', 94100756],
 [14, u'Vietnam', 91679733],
 [15, u'Germany', 82726626],
 [16, u'Egypt', 82056378],
 [17, u'Iran', 77447168],
 [18, u'Turkey', 74932641],
 [19, u'Congo, Democratic Republic of the', 67513677],
 [20, u'Thailand', 67010502],
 [21, u'France', 64291280],
 [22, u'United Kingdom', 63136265],
 [23, u'Italy', 60990277],
 [24, u'Myanmar', 53259018],
 [25, u'South Africa', 52776130],
 [26, u'Korea, South', 49262698],
 [27, u'Tanzania', 49253126],
 [28, u'Colombia', 48321405],
 [29, u'Spain', 46926963],
 [30, u'Ukraine', 45238805],
 [31, u'Kenya', 44353691],
 [32, u'Argentina', 41446246],
 [33, u'Algeria', 39208194],
 [34, u'Poland', 38216635],
 [35, u'Sudan', 37964306],
 [36, u'Uganda', 37578876],
 [37, u'Canada', 35181704],
 [38, u'Iraq', 33765232],
 [39, u'Morocco', 33008150],
 [40, u'Afghanistan', 30551674],
 [41, u'Venezuela', 30405207],
 [42, u'Peru', 30375603],
 [43, u'Malaysia', 29716965],
 [44, u'Uzbekistan', 28934102],
 [45, u'Saudi Arabia', 28828870],
 [46, u'Nepal', 27797457],
 [47, u'Ghana', 25904598],
 [48, u'Mozambique', 25833752],
 [49, u'Korea, North', 24895480],
 [50, u'Yemen', 24407381],
 [51, u'Australia', 23342553],
 [52, u'Taiwan', 23329772],
 [53, u'Madagascar', 22924851],
 [54, u'Cameroon', 22253959],
 [55, u'Syria', 21898061],
 [56, u'Romania', 21698585],
 [57, u'Angola', 21471618],
 [58, u'Sri Lanka', 21273228],
 [59, u"C\xf4te d'Ivoire", 20316086],
 [60, u'Niger', 17831270],
 [61, u'Chile', 17619708],
 [62, u'Burkina Faso', 16934839],
 [63, u'Netherlands', 16759229],
 [64, u'Kazakhstan', 16440586],
 [65, u'Malawi', 16362567],
 [66, u'Ecuador', 15737878],
 [67, u'Guatemala', 15468203],
 [68, u'Mali', 15301650],
 [69, u'Cambodia', 15135169],
 [70, u'Zambia', 14538640],
 [71, u'Zimbabwe', 14149648],
 [72, u'Senegal', 14133280],
 [73, u'Chad', 12825314],
 [74, u'Rwanda', 11776522],
 [75, u'Guinea', 11745189],
 [76, u'South Sudan', 11296173],
 [77, u'Cuba', 11265629],
 [78, u'Greece', 11127990],
 [79, u'Belgium', 11104476],
 [80, u'Tunisia', 10996515],
 [81, u'Czech Republic', 10702197],
 [82, u'Bolivia', 10671200],
 [83, u'Portugal', 10608156],
 [84, u'Somalia', 10495583],
 [85, u'Dominican Republic', 10403761],
 [86, u'Benin', 10323474],
 [87, u'Haiti', 10317461],
 [88, u'Burundi', 10162532],
 [89, u'Hungary', 9954941],
 [90, u'Sweden', 9571105],
 [91, u'Serbia; Kosovo', 9510506],
 [92, u'Azerbaijan', 9413420],
 [93, u'Belarus', 9356678],
 [94, u'United Arab Emirates', 9346129],
 [95, u'Austria', 8495145],
 [96, u'Tajikistan', 8207834],
 [97, u'Honduras', 8097688],
 [98, u'Switzerland', 8077833],
 [99, u'Israel', 7733144],
 [100, u'Papua New Guinea', 7321262],
 [101, u'Jordan', 7273799],
 [102, u'Bulgaria', 7222943],
 [None, u'Hong Kong', 7203836],
 [103, u'Togo', 6816982],
 [104, u'Paraguay', 6802295],
 [105, u'Laos', 6769727],
 [106, u'El Salvador', 6340454],
 [107, u'Eritrea', 6333135],
 [108, u'Libya', 6201521],
 [109, u'Sierra Leone', 6092075],
 [110, u'Nicaragua', 6080478],
 [111, u'Denmark', 5619096],
 [112, u'Kyrgyzstan', 5547548],
 [113, u'Slovakia', 5450223],
 [114, u'Finland', 5426323],
 [115, u'Singapore', 5411737],
 [116, u'Turkmenistan', 5240072],
 [117, u'Norway', 5042671],
 [118, u'Costa Rica', 4872166],
 [119, u'Lebanon', 4821971],
 [120, u'Ireland', 4627173],
 [121, u'Central African Republic', 4616417],
 [122, u'New Zealand', 4505761],
 [123, u'Congo, Republic of the', 4447632],
 [124, u'Georgia', 4340895],
 [125, u'Palestine', 4326295],
 [126, u'Liberia', 4294077],
 [127, u'Croatia', 4289714],
 [128, u'Mauritania', 3889880],
 [129, u'Panama', 3864170],
 [130, u'Bosnia and Herzegovina', 3829307],
 [None, u'Puerto Rico', 3688318],
 [131, u'Oman', 3632444],
 [132, u'Moldova', 3487204],
 [133, u'Uruguay', 3407062],
 [134, u'Kuwait', 3368572],
 [135, u'Albania', 3173271],
 [136, u'Lithuania', 3016933],
 [137, u'Armenia', 2976566],
 [138, u'Mongolia', 2839073],
 [139, u'Jamaica', 2783888],
 [140, u'Namibia', 2303315],
 [141, u'Qatar', 2168673],
 [142, u'Macedonia', 2107158],
 [143, u'Lesotho', 2074465],
 [144, u'Slovenia', 2071997],
 [145, u'Latvia', 2050317],
 [146, u'Botswana', 2021144],
 [147, u'Gambia', 1849285],
 [148, u'Guinea-Bissau', 1704255],
 [149, u'Gabon', 1671711],
 [150, u'Trinidad and Tobago', 1341151],
 [151, u'Bahrain', 1332171],
 [152, u'Estonia', 1287251],
 [153, u'Swaziland', 1249514],
 [154, u'Mauritius', 1244403],
 [155, u'Cyprus', 1141166],
 [156, u'Timor-Leste', 1132879],
 [157, u'Fiji', 881065],
 [None, u'R\xe9union', 875375],
 [158, u'Djibouti', 872932],
 [159, u'Guyana', 799613],
 [160, u'Equatorial Guinea', 757014],
 [161, u'Bhutan', 753947],
 [162, u'Comoros', 734917],
 [163, u'Montenegro', 621383],
 [None, u'Western Sahara', 567315],
 [None, u'Macau', 566375],
 [164, u'Solomon Islands', 561231],
 [165, u'Suriname', 539276],
 [166, u'Luxembourg', 530380],
 [167, u'Cape Verde', 498897],
 [None, u'Guadeloupe', 465800],
 [168, u'Malta', 429004],
 [169, u'Brunei', 417784],
 [None, u'Martinique', 403682],
 [170, u'Bahamas', 377374],
 [171, u'Maldives', 345023],
 [172, u'Belize', 331900],
 [173, u'Iceland', 329535],
 [174, u'Barbados', 284644],
 [None, u'French Polynesia', 276831],
 [None, u'New Caledonia', 256496],
 [175, u'Vanuatu', 252763],
 [None, u'French Guiana', 249227],
 [None, u'Mayotte', 222152],
 [176, u'S\xe3o Tom\xe9 and Pr\xedncipe', 192993],
 [177, u'Samoa', 190372],
 [178, u'Saint Lucia', 182273],
 [None, u'Guam', 165124],
 [None, u'Guernsey; Jersey', 162018],
 [None, u'Cura\xe7ao', 158760],
 [179, u'Saint Vincent and the Grenadines', 109373],
 [None, u'Virgin Islands, United States', 106627],
 [180, u'Grenada', 105897],
 [181, u'Tonga', 105323],
 [182, u'Micronesia, Federated States of', 103549],
 [None, u'Aruba', 102911],
 [183, u'Kiribati', 102351],
 [184, u'Seychelles', 92838],
 [185, u'Antigua and Barbuda', 89985],
 [None, u'Isle of Man', 85888],
 [186, u'Andorra', 79218],
 [187, u'Dominica', 72003],
 [None, u'Bermuda', 65341],
 [None, u'Cayman Islands', 58435],
 [None, u'Greenland', 56987],
 [None, u'American Samoa', 55165],
 [188, u'Saint Kitts and Nevis', 54191],
 [None, u'Northern Mariana Islands', 53855],
 [189, u'Marshall Islands', 52634],
 [None, u'Faroe Islands', 49469],
 [None, u'Sint Maarten', 45233],
 [190, u'Monaco', 37831],
 [191, u'Liechtenstein', 36925],
 [None, u'Turks and Caicos Islands', 33098],
 [192, u'San Marino', 31448],
 [None, u'Gibraltar', 29310],
 [None, u'Virgin Islands, British', 28341],
 [193, u'Palau', 20918],
 [None, u'Cook Islands', 20629],
 [None, u'Caribbean Netherlands', 19130],
 [None, u'Anguilla', 14300],
 [None, u'Wallis and Futuna', 13272],
 [194, u'Nauru', 10051],
 [195, u'Tuvalu', 9876],
 [None, u'Saint Pierre and Miquelon', 6043],
 [None, u'Montserrat', 5091],
 [None, u'Saint Helena, Ascension and Tristan da Cunha', 4129],
 [None, u'Falkland Islands', 3044],
 [None, u'Niue', 1344],
 [None, u'Tokelau', 1195],
 [196, u'Vatican City', 799]]

In [4]:
world_pop = sum([r[2] for r in pop_list])
world_pop


Out[4]:
7162119434L

In [5]:
# http://stackoverflow.com/a/15889203/7782
def cumsum(lis):
    total = 0
    for x in lis:
        total += x
        yield total

In [6]:
cum_pop = list(cumsum((r[2] for r in pop_list)))
cum_pop


Out[6]:
[1385566537,
 2637706133L,
 2957756849L,
 3207622480L,
 3407984405L,
 3590126999L,
 3763742344L,
 3920337306L,
 4063170995L,
 4190314572L,
 4312646971L,
 4411040545L,
 4505141301L,
 4596821034L,
 4679547660L,
 4761604038L,
 4839051206L,
 4913983847L,
 4981497524L,
 5048508026L,
 5112799306L,
 5175935571L,
 5236925848L,
 5290184866L,
 5342960996L,
 5392223694L,
 5441476820L,
 5489798225L,
 5536725188L,
 5581963993L,
 5626317684L,
 5667763930L,
 5706972124L,
 5745188759L,
 5783153065L,
 5820731941L,
 5855913645L,
 5889678877L,
 5922687027L,
 5953238701L,
 5983643908L,
 6014019511L,
 6043736476L,
 6072670578L,
 6101499448L,
 6129296905L,
 6155201503L,
 6181035255L,
 6205930735L,
 6230338116L,
 6253680669L,
 6277010441L,
 6299935292L,
 6322189251L,
 6344087312L,
 6365785897L,
 6387257515L,
 6408530743L,
 6428846829L,
 6446678099L,
 6464297807L,
 6481232646L,
 6497991875L,
 6514432461L,
 6530795028L,
 6546532906L,
 6562001109L,
 6577302759L,
 6592437928L,
 6606976568L,
 6621126216L,
 6635259496L,
 6648084810L,
 6659861332L,
 6671606521L,
 6682902694L,
 6694168323L,
 6705296313L,
 6716400789L,
 6727397304L,
 6738099501L,
 6748770701L,
 6759378857L,
 6769874440L,
 6780278201L,
 6790601675L,
 6800919136L,
 6811081668L,
 6821036609L,
 6830607714L,
 6840118220L,
 6849531640L,
 6858888318L,
 6868234447L,
 6876729592L,
 6884937426L,
 6893035114L,
 6901112947L,
 6908846091L,
 6916167353L,
 6923441152L,
 6930664095L,
 6937867931L,
 6944684913L,
 6951487208L,
 6958256935L,
 6964597389L,
 6970930524L,
 6977132045L,
 6983224120L,
 6989304598L,
 6994923694L,
 7000471242L,
 7005921465L,
 7011347788L,
 7016759525L,
 7021999597L,
 7027042268L,
 7031914434L,
 7036736405L,
 7041363578L,
 7045979995L,
 7050485756L,
 7054933388L,
 7059274283L,
 7063600578L,
 7067894655L,
 7072184369L,
 7076074249L,
 7079938419L,
 7083767726L,
 7087456044L,
 7091088488L,
 7094575692L,
 7097982754L,
 7101351326L,
 7104524597L,
 7107541530L,
 7110518096L,
 7113357169L,
 7116141057L,
 7118444372L,
 7120613045L,
 7122720203L,
 7124794668L,
 7126866665L,
 7128916982L,
 7130938126L,
 7132787411L,
 7134491666L,
 7136163377L,
 7137504528L,
 7138836699L,
 7140123950L,
 7141373464L,
 7142617867L,
 7143759033L,
 7144891912L,
 7145772977L,
 7146648352L,
 7147521284L,
 7148320897L,
 7149077911L,
 7149831858L,
 7150566775L,
 7151188158L,
 7151755473L,
 7152321848L,
 7152883079L,
 7153422355L,
 7153952735L,
 7154451632L,
 7154917432L,
 7155346436L,
 7155764220L,
 7156167902L,
 7156545276L,
 7156890299L,
 7157222199L,
 7157551734L,
 7157836378L,
 7158113209L,
 7158369705L,
 7158622468L,
 7158871695L,
 7159093847L,
 7159286840L,
 7159477212L,
 7159659485L,
 7159824609L,
 7159986627L,
 7160145387L,
 7160254760L,
 7160361387L,
 7160467284L,
 7160572607L,
 7160676156L,
 7160779067L,
 7160881418L,
 7160974256L,
 7161064241L,
 7161150129L,
 7161229347L,
 7161301350L,
 7161366691L,
 7161425126L,
 7161482113L,
 7161537278L,
 7161591469L,
 7161645324L,
 7161697958L,
 7161747427L,
 7161792660L,
 7161830491L,
 7161867416L,
 7161900514L,
 7161931962L,
 7161961272L,
 7161989613L,
 7162010531L,
 7162031160L,
 7162050290L,
 7162064590L,
 7162077862L,
 7162087913L,
 7162097789L,
 7162103832L,
 7162108923L,
 7162113052L,
 7162116096L,
 7162117440L,
 7162118635L,
 7162119434L]

In [7]:
import bisect
import random

In [8]:
# http://docs.python.org/2/library/bisect.html
bisect.bisect_left(cum_pop,world_pop/2)


Out[8]:
5

In [9]:
float(cum_pop[5])/world_pop


Out[9]:
0.5012660054169099

In [10]:
len(cum_pop)


Out[10]:
233

In [11]:
pop_list[0][1]


Out[11]:
u'China'

In [12]:
from itertools import repeat
from collections import Counter

def random_country_weighted_by_pop():
    while True:
        yield pop_list[bisect.bisect_left(cum_pop,random.randint(1,world_pop))][1]
        
Counter(islice(random_country_weighted_by_pop(),5))


Out[12]:
Counter({u'United Kingdom': 1, u'Thailand': 1, u'Tanzania': 1, u'China': 1, u'Bangladesh': 1})

CIA Handbook


In [40]:
import requests
import locale
import json

locale.setlocale( locale.LC_ALL, 'en_US.UTF-8' ) 

cia_url = "https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt"
content = requests.get(cia_url).content

In [14]:
cia_pop_list = [(int(x[0]), x[1], locale.atoi(x[2])) for x in [r.split("\t") for r in content.strip().split("\r")]]
cia_pop_list


Out[14]:
[(1, 'China', 1349585838),
 (2, 'India', 1220800359),
 (3, 'European Union', 509365627),
 (4, 'United States', 316438601),
 (5, 'Indonesia', 251160124),
 (6, 'Brazil', 201009622),
 (7, 'Pakistan', 193238868),
 (8, 'Nigeria', 174507539),
 (9, 'Bangladesh', 163654860),
 (10, 'Russia', 142500482),
 (11, 'Japan', 127253075),
 (12, 'Mexico', 118818228),
 (13, 'Philippines', 105720644),
 (14, 'Ethiopia', 93877025),
 (15, 'Vietnam', 92477857),
 (16, 'Egypt', 85294388),
 (17, 'Germany', 81147265),
 (18, 'Turkey', 80694485),
 (19, 'Iran', 79853900),
 (20, 'Congo, Democratic Republic of the', 75507308),
 (21, 'Thailand', 67497151),
 (22, 'France', 65951611),
 (23, 'United Kingdom', 63395574),
 (24, 'Italy', 61482297),
 (25, 'Burma', 55167330),
 (26, 'Korea, South', 48955203),
 (27, 'South Africa', 48601098),
 (28, 'Tanzania', 48261942),
 (29, 'Spain', 47370542),
 (30, 'Colombia', 45745783),
 (31, 'Ukraine', 44573205),
 (32, 'Kenya', 44037656),
 (33, 'Argentina', 42610981),
 (34, 'Poland', 38383809),
 (35, 'Algeria', 38087812),
 (36, 'Sudan', 34847910),
 (37, 'Uganda', 34758809),
 (38, 'Canada', 34568211),
 (39, 'Morocco', 32649130),
 (40, 'Iraq', 31858481),
 (41, 'Afghanistan', 31108077),
 (42, 'Nepal', 30430267),
 (43, 'Peru', 29849303),
 (44, 'Malaysia', 29628392),
 (45, 'Uzbekistan', 28661637),
 (46, 'Venezuela', 28459085),
 (47, 'Saudi Arabia', 26939583),
 (48, 'Yemen', 25338458),
 (49, 'Ghana', 25199609),
 (50, 'Korea, North', 24720407),
 (51, 'Mozambique', 24096669),
 (52, 'Taiwan', 23299716),
 (53, 'Madagascar', 22599098),
 (54, 'Cameroon', 22534532),
 (55, 'Syria', 22457336),
 (56, "Cote d'Ivoire", 22400835),
 (57, 'Australia', 22262501),
 (58, 'Romania', 21790479),
 (59, 'Sri Lanka', 21675648),
 (60, 'Angola', 18565269),
 (61, 'Burkina Faso', 17812961),
 (62, 'Kazakhstan', 17736896),
 (63, 'Chile', 17216945),
 (64, 'Niger', 16899327),
 (65, 'Netherlands', 16805037),
 (66, 'Malawi', 16777547),
 (67, 'Mali', 15968882),
 (68, 'Ecuador', 15439429),
 (69, 'Cambodia', 15205539),
 (70, 'Guatemala', 14373472),
 (71, 'Zambia', 14222233),
 (72, 'Senegal', 13300410),
 (73, 'Zimbabwe', 13182908),
 (74, 'Rwanda', 12012589),
 (75, 'Chad', 11193452),
 (76, 'Guinea', 11176026),
 (77, 'South Sudan', 11090104),
 (78, 'Cuba', 11061886),
 (79, 'Tunisia', 10835873),
 (80, 'Portugal', 10799270),
 (81, 'Greece', 10772967),
 (82, 'Czech Republic', 10609762),
 (83, 'Bolivia', 10461053),
 (84, 'Belgium', 10444268),
 (85, 'Somalia', 10251568),
 (86, 'Dominican Republic', 10219630),
 (87, 'Hungary', 9939470),
 (88, 'Haiti', 9893934),
 (89, 'Benin', 9877292),
 (90, 'Sweden', 9647386),
 (91, 'Belarus', 9625888),
 (92, 'Azerbaijan', 9590159),
 (93, 'Honduras', 8448465),
 (94, 'Austria', 8221646),
 (95, 'Switzerland', 7996026),
 (96, 'Tajikistan', 7910041),
 (97, 'Israel', 7707042),
 (98, 'Serbia', 7243007),
 (99, 'Togo', 7154237),
 (100, 'Hong Kong', 7082316),
 (101, 'Bulgaria', 6981642),
 (102, 'Laos', 6695166),
 (103, 'Paraguay', 6623252),
 (104, 'Jordan', 6482081),
 (105, 'Papua New Guinea', 6431902),
 (106, 'Eritrea', 6233682),
 (107, 'El Salvador', 6108590),
 (108, 'Libya', 6002347),
 (109, 'Nicaragua', 5788531),
 (110, 'Sierra Leone', 5612685),
 (111, 'Denmark', 5556452),
 (112, 'Kyrgyzstan', 5548042),
 (113, 'Slovakia', 5488339),
 (114, 'United Arab Emirates', 5473972),
 (115, 'Singapore', 5460302),
 (116, 'Finland', 5266114),
 (117, 'Central African Republic', 5166510),
 (118, 'Turkmenistan', 5113040),
 (119, 'Norway', 5085582),
 (120, 'Georgia', 4942157),
 (121, 'Ireland', 4775982),
 (122, 'Costa Rica', 4695942),
 (123, 'Congo, Republic of the', 4574099),
 (124, 'Croatia', 4475611),
 (125, 'New Zealand', 4365113),
 (126, 'Lebanon', 4131583),
 (127, 'Liberia', 3989703),
 (128, 'Bosnia and Herzegovina', 3875723),
 (129, 'Puerto Rico', 3645648),
 (130, 'Moldova', 3619925),
 (131, 'Panama', 3559408),
 (132, 'Lithuania', 3515858),
 (133, 'Mauritania', 3437610),
 (134, 'Uruguay', 3324460),
 (135, 'Oman', 3154134),
 (136, 'Armenia', 3064267),
 (137, 'Albania', 3011405),
 (138, 'Mongolia', 2912192),
 (139, 'Jamaica', 2909714),
 (140, 'Kuwait', 2695316),
 (141, 'West Bank', 2676740),
 (142, 'Namibia', 2182852),
 (143, 'Latvia', 2178443),
 (144, 'Botswana', 2127825),
 (145, 'Macedonia', 2087171),
 (146, 'Qatar', 2042444),
 (147, 'Slovenia', 1992690),
 (148, 'Lesotho', 1936181),
 (149, 'Gambia, The', 1883051),
 (150, 'Kosovo', 1847708),
 (151, 'Gaza Strip', 1763387),
 (152, 'Guinea-Bissau', 1660870),
 (153, 'Gabon', 1640286),
 (154, 'Swaziland', 1403362),
 (155, 'Mauritius', 1322238),
 (156, 'Bahrain', 1281332),
 (157, 'Estonia', 1266375),
 (158, 'Trinidad and Tobago', 1225225),
 (159, 'Timor-Leste', 1172390),
 (160, 'Cyprus', 1155403),
 (161, 'Burundi', 1060714),
 (162, 'Fiji', 896758),
 (163, 'Djibouti', 792198),
 (164, 'Comoros', 752288),
 (165, 'Guyana', 739903),
 (166, 'Bhutan', 725296),
 (167, 'Equatorial Guinea', 704001),
 (168, 'Montenegro', 653474),
 (169, 'Solomon Islands', 597248),
 (170, 'Macau', 583003),
 (171, 'Suriname', 566846),
 (172, 'Western Sahara', 538811),
 (173, 'Cabo Verde', 531046),
 (174, 'Luxembourg', 514862),
 (175, 'Brunei', 415717),
 (176, 'Malta', 411277),
 (177, 'Maldives', 393988),
 (178, 'Belize', 334297),
 (179, 'Bahamas, The', 319031),
 (180, 'Iceland', 315281),
 (181, 'Barbados', 288725),
 (182, 'French Polynesia', 277293),
 (183, 'New Caledonia', 264022),
 (184, 'Vanuatu', 261565),
 (185, 'Samoa', 195476),
 (186, 'Sao Tome and Principe', 186817),
 (187, 'Saint Lucia', 162781),
 (188, 'Guam', 160378),
 (189, 'Curacao', 146836),
 (190, 'Grenada', 109590),
 (191, 'Aruba', 109153),
 (192, 'Tonga', 106322),
 (193, 'Micronesia, Federated States of', 106104),
 (194, 'Virgin Islands', 104737),
 (195, 'Kiribati', 103248),
 (196, 'Saint Vincent and the Grenadines', 103220),
 (197, 'Jersey', 95732),
 (198, 'Seychelles', 90846),
 (199, 'Antigua and Barbuda', 90156),
 (200, 'Isle of Man', 86159),
 (201, 'Andorra', 85293),
 (202, 'Dominica', 73286),
 (203, 'Marshall Islands', 69747),
 (204, 'Bermuda', 69467),
 (205, 'Guernsey', 65605),
 (206, 'Greenland', 57714),
 (207, 'American Samoa', 54719),
 (208, 'Cayman Islands', 53737),
 (209, 'Northern Mariana Islands', 51170),
 (210, 'Saint Kitts and Nevis', 51134),
 (211, 'Faroe Islands', 49709),
 (212, 'Turks and Caicos Islands', 47754),
 (213, 'Sint Maarten', 39689),
 (214, 'Liechtenstein', 37009),
 (215, 'San Marino', 32448),
 (216, 'British Virgin Islands', 31912),
 (217, 'Saint Martin', 31264),
 (218, 'Monaco', 30500),
 (219, 'Gibraltar', 29111),
 (220, 'Palau', 21108),
 (221, 'Anguilla', 15754),
 (222, 'Dhekelia', 15700),
 (223, 'Akrotiri', 15700),
 (224, 'Wallis and Futuna', 15507),
 (225, 'Tuvalu', 10698),
 (226, 'Cook Islands', 10447),
 (227, 'Nauru', 9434),
 (228, 'Saint Helena, Ascension, and Tristan da Cunha', 7754),
 (229, 'Saint Barthelemy', 7298),
 (230, 'Saint Pierre and Miquelon', 5774),
 (231, 'Montserrat', 5189),
 (232, 'Falkland Islands (Islas Malvinas)', 3140),
 (233, 'Norfolk Island', 2196),
 (234, 'Svalbard', 1921),
 (235, 'Christmas Island', 1513),
 (236, 'Tokelau', 1353),
 (237, 'Niue', 1229),
 (238, 'Holy See (Vatican City)', 839),
 (239, 'Cocos (Keeling) Islands', 596),
 (240, 'Pitcairn Islands', 65)]

In [15]:
print json.dumps(cia_pop_list)


[[1, "China", 1349585838], [2, "India", 1220800359], [3, "European Union", 509365627], [4, "United States", 316438601], [5, "Indonesia", 251160124], [6, "Brazil", 201009622], [7, "Pakistan", 193238868], [8, "Nigeria", 174507539], [9, "Bangladesh", 163654860], [10, "Russia", 142500482], [11, "Japan", 127253075], [12, "Mexico", 118818228], [13, "Philippines", 105720644], [14, "Ethiopia", 93877025], [15, "Vietnam", 92477857], [16, "Egypt", 85294388], [17, "Germany", 81147265], [18, "Turkey", 80694485], [19, "Iran", 79853900], [20, "Congo, Democratic Republic of the", 75507308], [21, "Thailand", 67497151], [22, "France", 65951611], [23, "United Kingdom", 63395574], [24, "Italy", 61482297], [25, "Burma", 55167330], [26, "Korea, South", 48955203], [27, "South Africa", 48601098], [28, "Tanzania", 48261942], [29, "Spain", 47370542], [30, "Colombia", 45745783], [31, "Ukraine", 44573205], [32, "Kenya", 44037656], [33, "Argentina", 42610981], [34, "Poland", 38383809], [35, "Algeria", 38087812], [36, "Sudan", 34847910], [37, "Uganda", 34758809], [38, "Canada", 34568211], [39, "Morocco", 32649130], [40, "Iraq", 31858481], [41, "Afghanistan", 31108077], [42, "Nepal", 30430267], [43, "Peru", 29849303], [44, "Malaysia", 29628392], [45, "Uzbekistan", 28661637], [46, "Venezuela", 28459085], [47, "Saudi Arabia", 26939583], [48, "Yemen", 25338458], [49, "Ghana", 25199609], [50, "Korea, North", 24720407], [51, "Mozambique", 24096669], [52, "Taiwan", 23299716], [53, "Madagascar", 22599098], [54, "Cameroon", 22534532], [55, "Syria", 22457336], [56, "Cote d'Ivoire", 22400835], [57, "Australia", 22262501], [58, "Romania", 21790479], [59, "Sri Lanka", 21675648], [60, "Angola", 18565269], [61, "Burkina Faso", 17812961], [62, "Kazakhstan", 17736896], [63, "Chile", 17216945], [64, "Niger", 16899327], [65, "Netherlands", 16805037], [66, "Malawi", 16777547], [67, "Mali", 15968882], [68, "Ecuador", 15439429], [69, "Cambodia", 15205539], [70, "Guatemala", 14373472], [71, "Zambia", 14222233], [72, "Senegal", 13300410], [73, "Zimbabwe", 13182908], [74, "Rwanda", 12012589], [75, "Chad", 11193452], [76, "Guinea", 11176026], [77, "South Sudan", 11090104], [78, "Cuba", 11061886], [79, "Tunisia", 10835873], [80, "Portugal", 10799270], [81, "Greece", 10772967], [82, "Czech Republic", 10609762], [83, "Bolivia", 10461053], [84, "Belgium", 10444268], [85, "Somalia", 10251568], [86, "Dominican Republic", 10219630], [87, "Hungary", 9939470], [88, "Haiti", 9893934], [89, "Benin", 9877292], [90, "Sweden", 9647386], [91, "Belarus", 9625888], [92, "Azerbaijan", 9590159], [93, "Honduras", 8448465], [94, "Austria", 8221646], [95, "Switzerland", 7996026], [96, "Tajikistan", 7910041], [97, "Israel", 7707042], [98, "Serbia", 7243007], [99, "Togo", 7154237], [100, "Hong Kong", 7082316], [101, "Bulgaria", 6981642], [102, "Laos", 6695166], [103, "Paraguay", 6623252], [104, "Jordan", 6482081], [105, "Papua New Guinea", 6431902], [106, "Eritrea", 6233682], [107, "El Salvador", 6108590], [108, "Libya", 6002347], [109, "Nicaragua", 5788531], [110, "Sierra Leone", 5612685], [111, "Denmark", 5556452], [112, "Kyrgyzstan", 5548042], [113, "Slovakia", 5488339], [114, "United Arab Emirates", 5473972], [115, "Singapore", 5460302], [116, "Finland", 5266114], [117, "Central African Republic", 5166510], [118, "Turkmenistan", 5113040], [119, "Norway", 5085582], [120, "Georgia", 4942157], [121, "Ireland", 4775982], [122, "Costa Rica", 4695942], [123, "Congo, Republic of the", 4574099], [124, "Croatia", 4475611], [125, "New Zealand", 4365113], [126, "Lebanon", 4131583], [127, "Liberia", 3989703], [128, "Bosnia and Herzegovina", 3875723], [129, "Puerto Rico", 3645648], [130, "Moldova", 3619925], [131, "Panama", 3559408], [132, "Lithuania", 3515858], [133, "Mauritania", 3437610], [134, "Uruguay", 3324460], [135, "Oman", 3154134], [136, "Armenia", 3064267], [137, "Albania", 3011405], [138, "Mongolia", 2912192], [139, "Jamaica", 2909714], [140, "Kuwait", 2695316], [141, "West Bank", 2676740], [142, "Namibia", 2182852], [143, "Latvia", 2178443], [144, "Botswana", 2127825], [145, "Macedonia", 2087171], [146, "Qatar", 2042444], [147, "Slovenia", 1992690], [148, "Lesotho", 1936181], [149, "Gambia, The", 1883051], [150, "Kosovo", 1847708], [151, "Gaza Strip", 1763387], [152, "Guinea-Bissau", 1660870], [153, "Gabon", 1640286], [154, "Swaziland", 1403362], [155, "Mauritius", 1322238], [156, "Bahrain", 1281332], [157, "Estonia", 1266375], [158, "Trinidad and Tobago", 1225225], [159, "Timor-Leste", 1172390], [160, "Cyprus", 1155403], [161, "Burundi", 1060714], [162, "Fiji", 896758], [163, "Djibouti", 792198], [164, "Comoros", 752288], [165, "Guyana", 739903], [166, "Bhutan", 725296], [167, "Equatorial Guinea", 704001], [168, "Montenegro", 653474], [169, "Solomon Islands", 597248], [170, "Macau", 583003], [171, "Suriname", 566846], [172, "Western Sahara", 538811], [173, "Cabo Verde", 531046], [174, "Luxembourg", 514862], [175, "Brunei", 415717], [176, "Malta", 411277], [177, "Maldives", 393988], [178, "Belize", 334297], [179, "Bahamas, The", 319031], [180, "Iceland", 315281], [181, "Barbados", 288725], [182, "French Polynesia", 277293], [183, "New Caledonia", 264022], [184, "Vanuatu", 261565], [185, "Samoa", 195476], [186, "Sao Tome and Principe", 186817], [187, "Saint Lucia", 162781], [188, "Guam", 160378], [189, "Curacao", 146836], [190, "Grenada", 109590], [191, "Aruba", 109153], [192, "Tonga", 106322], [193, "Micronesia, Federated States of", 106104], [194, "Virgin Islands", 104737], [195, "Kiribati", 103248], [196, "Saint Vincent and the Grenadines", 103220], [197, "Jersey", 95732], [198, "Seychelles", 90846], [199, "Antigua and Barbuda", 90156], [200, "Isle of Man", 86159], [201, "Andorra", 85293], [202, "Dominica", 73286], [203, "Marshall Islands", 69747], [204, "Bermuda", 69467], [205, "Guernsey", 65605], [206, "Greenland", 57714], [207, "American Samoa", 54719], [208, "Cayman Islands", 53737], [209, "Northern Mariana Islands", 51170], [210, "Saint Kitts and Nevis", 51134], [211, "Faroe Islands", 49709], [212, "Turks and Caicos Islands", 47754], [213, "Sint Maarten", 39689], [214, "Liechtenstein", 37009], [215, "San Marino", 32448], [216, "British Virgin Islands", 31912], [217, "Saint Martin", 31264], [218, "Monaco", 30500], [219, "Gibraltar", 29111], [220, "Palau", 21108], [221, "Anguilla", 15754], [222, "Dhekelia", 15700], [223, "Akrotiri", 15700], [224, "Wallis and Futuna", 15507], [225, "Tuvalu", 10698], [226, "Cook Islands", 10447], [227, "Nauru", 9434], [228, "Saint Helena, Ascension, and Tristan da Cunha", 7754], [229, "Saint Barthelemy", 7298], [230, "Saint Pierre and Miquelon", 5774], [231, "Montserrat", 5189], [232, "Falkland Islands (Islas Malvinas)", 3140], [233, "Norfolk Island", 2196], [234, "Svalbard", 1921], [235, "Christmas Island", 1513], [236, "Tokelau", 1353], [237, "Niue", 1229], [238, "Holy See (Vatican City)", 839], [239, "Cocos (Keeling) Islands", 596], [240, "Pitcairn Islands", 65]]

In [41]:
# https://gist.github.com/rdhyee/8530164/raw/f8e842fe8ccd6e3bc424e3a24e41ef5c38f419e8/world_factbook_poulation.json
# https://gist.github.com/rdhyee/8530164
# https://www.cia.gov/library/publications/the-world-factbook/rankorder/2119rank.html
# https://www.cia.gov/library/publications/the-world-factbook/rankorder/rawdata_2119.txt


import json
import requests

cia_json_url = "https://gist.github.com/rdhyee/8530164/raw/f8e842fe8ccd6e3bc424e3a24e41ef5c38f419e8/world_factbook_poulation.json"
cia_list= requests.get(cia_json_url).json()
cia_list


Out[41]:
[[1, u'China', 1349585838],
 [2, u'India', 1220800359],
 [3, u'European Union', 509365627],
 [4, u'United States', 316438601],
 [5, u'Indonesia', 251160124],
 [6, u'Brazil', 201009622],
 [7, u'Pakistan', 193238868],
 [8, u'Nigeria', 174507539],
 [9, u'Bangladesh', 163654860],
 [10, u'Russia', 142500482],
 [11, u'Japan', 127253075],
 [12, u'Mexico', 118818228],
 [13, u'Philippines', 105720644],
 [14, u'Ethiopia', 93877025],
 [15, u'Vietnam', 92477857],
 [16, u'Egypt', 85294388],
 [17, u'Germany', 81147265],
 [18, u'Turkey', 80694485],
 [19, u'Iran', 79853900],
 [20, u'Congo, Democratic Republic of the', 75507308],
 [21, u'Thailand', 67497151],
 [22, u'France', 65951611],
 [23, u'United Kingdom', 63395574],
 [24, u'Italy', 61482297],
 [25, u'Burma', 55167330],
 [26, u'Korea, South', 48955203],
 [27, u'South Africa', 48601098],
 [28, u'Tanzania', 48261942],
 [29, u'Spain', 47370542],
 [30, u'Colombia', 45745783],
 [31, u'Ukraine', 44573205],
 [32, u'Kenya', 44037656],
 [33, u'Argentina', 42610981],
 [34, u'Poland', 38383809],
 [35, u'Algeria', 38087812],
 [36, u'Sudan', 34847910],
 [37, u'Uganda', 34758809],
 [38, u'Canada', 34568211],
 [39, u'Morocco', 32649130],
 [40, u'Iraq', 31858481],
 [41, u'Afghanistan', 31108077],
 [42, u'Nepal', 30430267],
 [43, u'Peru', 29849303],
 [44, u'Malaysia', 29628392],
 [45, u'Uzbekistan', 28661637],
 [46, u'Venezuela', 28459085],
 [47, u'Saudi Arabia', 26939583],
 [48, u'Yemen', 25338458],
 [49, u'Ghana', 25199609],
 [50, u'Korea, North', 24720407],
 [51, u'Mozambique', 24096669],
 [52, u'Taiwan', 23299716],
 [53, u'Madagascar', 22599098],
 [54, u'Cameroon', 22534532],
 [55, u'Syria', 22457336],
 [56, u"Cote d'Ivoire", 22400835],
 [57, u'Australia', 22262501],
 [58, u'Romania', 21790479],
 [59, u'Sri Lanka', 21675648],
 [60, u'Angola', 18565269],
 [61, u'Burkina Faso', 17812961],
 [62, u'Kazakhstan', 17736896],
 [63, u'Chile', 17216945],
 [64, u'Niger', 16899327],
 [65, u'Netherlands', 16805037],
 [66, u'Malawi', 16777547],
 [67, u'Mali', 15968882],
 [68, u'Ecuador', 15439429],
 [69, u'Cambodia', 15205539],
 [70, u'Guatemala', 14373472],
 [71, u'Zambia', 14222233],
 [72, u'Senegal', 13300410],
 [73, u'Zimbabwe', 13182908],
 [74, u'Rwanda', 12012589],
 [75, u'Chad', 11193452],
 [76, u'Guinea', 11176026],
 [77, u'South Sudan', 11090104],
 [78, u'Cuba', 11061886],
 [79, u'Tunisia', 10835873],
 [80, u'Portugal', 10799270],
 [81, u'Greece', 10772967],
 [82, u'Czech Republic', 10609762],
 [83, u'Bolivia', 10461053],
 [84, u'Belgium', 10444268],
 [85, u'Somalia', 10251568],
 [86, u'Dominican Republic', 10219630],
 [87, u'Hungary', 9939470],
 [88, u'Haiti', 9893934],
 [89, u'Benin', 9877292],
 [90, u'Sweden', 9647386],
 [91, u'Belarus', 9625888],
 [92, u'Azerbaijan', 9590159],
 [93, u'Honduras', 8448465],
 [94, u'Austria', 8221646],
 [95, u'Switzerland', 7996026],
 [96, u'Tajikistan', 7910041],
 [97, u'Israel', 7707042],
 [98, u'Serbia', 7243007],
 [99, u'Togo', 7154237],
 [100, u'Hong Kong', 7082316],
 [101, u'Bulgaria', 6981642],
 [102, u'Laos', 6695166],
 [103, u'Paraguay', 6623252],
 [104, u'Jordan', 6482081],
 [105, u'Papua New Guinea', 6431902],
 [106, u'Eritrea', 6233682],
 [107, u'El Salvador', 6108590],
 [108, u'Libya', 6002347],
 [109, u'Nicaragua', 5788531],
 [110, u'Sierra Leone', 5612685],
 [111, u'Denmark', 5556452],
 [112, u'Kyrgyzstan', 5548042],
 [113, u'Slovakia', 5488339],
 [114, u'United Arab Emirates', 5473972],
 [115, u'Singapore', 5460302],
 [116, u'Finland', 5266114],
 [117, u'Central African Republic', 5166510],
 [118, u'Turkmenistan', 5113040],
 [119, u'Norway', 5085582],
 [120, u'Georgia', 4942157],
 [121, u'Ireland', 4775982],
 [122, u'Costa Rica', 4695942],
 [123, u'Congo, Republic of the', 4574099],
 [124, u'Croatia', 4475611],
 [125, u'New Zealand', 4365113],
 [126, u'Lebanon', 4131583],
 [127, u'Liberia', 3989703],
 [128, u'Bosnia and Herzegovina', 3875723],
 [129, u'Puerto Rico', 3645648],
 [130, u'Moldova', 3619925],
 [131, u'Panama', 3559408],
 [132, u'Lithuania', 3515858],
 [133, u'Mauritania', 3437610],
 [134, u'Uruguay', 3324460],
 [135, u'Oman', 3154134],
 [136, u'Armenia', 3064267],
 [137, u'Albania', 3011405],
 [138, u'Mongolia', 2912192],
 [139, u'Jamaica', 2909714],
 [140, u'Kuwait', 2695316],
 [141, u'West Bank', 2676740],
 [142, u'Namibia', 2182852],
 [143, u'Latvia', 2178443],
 [144, u'Botswana', 2127825],
 [145, u'Macedonia', 2087171],
 [146, u'Qatar', 2042444],
 [147, u'Slovenia', 1992690],
 [148, u'Lesotho', 1936181],
 [149, u'Gambia, The', 1883051],
 [150, u'Kosovo', 1847708],
 [151, u'Gaza Strip', 1763387],
 [152, u'Guinea-Bissau', 1660870],
 [153, u'Gabon', 1640286],
 [154, u'Swaziland', 1403362],
 [155, u'Mauritius', 1322238],
 [156, u'Bahrain', 1281332],
 [157, u'Estonia', 1266375],
 [158, u'Trinidad and Tobago', 1225225],
 [159, u'Timor-Leste', 1172390],
 [160, u'Cyprus', 1155403],
 [161, u'Burundi', 1060714],
 [162, u'Fiji', 896758],
 [163, u'Djibouti', 792198],
 [164, u'Comoros', 752288],
 [165, u'Guyana', 739903],
 [166, u'Bhutan', 725296],
 [167, u'Equatorial Guinea', 704001],
 [168, u'Montenegro', 653474],
 [169, u'Solomon Islands', 597248],
 [170, u'Macau', 583003],
 [171, u'Suriname', 566846],
 [172, u'Western Sahara', 538811],
 [173, u'Cabo Verde', 531046],
 [174, u'Luxembourg', 514862],
 [175, u'Brunei', 415717],
 [176, u'Malta', 411277],
 [177, u'Maldives', 393988],
 [178, u'Belize', 334297],
 [179, u'Bahamas, The', 319031],
 [180, u'Iceland', 315281],
 [181, u'Barbados', 288725],
 [182, u'French Polynesia', 277293],
 [183, u'New Caledonia', 264022],
 [184, u'Vanuatu', 261565],
 [185, u'Samoa', 195476],
 [186, u'Sao Tome and Principe', 186817],
 [187, u'Saint Lucia', 162781],
 [188, u'Guam', 160378],
 [189, u'Curacao', 146836],
 [190, u'Grenada', 109590],
 [191, u'Aruba', 109153],
 [192, u'Tonga', 106322],
 [193, u'Micronesia, Federated States of', 106104],
 [194, u'Virgin Islands', 104737],
 [195, u'Kiribati', 103248],
 [196, u'Saint Vincent and the Grenadines', 103220],
 [197, u'Jersey', 95732],
 [198, u'Seychelles', 90846],
 [199, u'Antigua and Barbuda', 90156],
 [200, u'Isle of Man', 86159],
 [201, u'Andorra', 85293],
 [202, u'Dominica', 73286],
 [203, u'Marshall Islands', 69747],
 [204, u'Bermuda', 69467],
 [205, u'Guernsey', 65605],
 [206, u'Greenland', 57714],
 [207, u'American Samoa', 54719],
 [208, u'Cayman Islands', 53737],
 [209, u'Northern Mariana Islands', 51170],
 [210, u'Saint Kitts and Nevis', 51134],
 [211, u'Faroe Islands', 49709],
 [212, u'Turks and Caicos Islands', 47754],
 [213, u'Sint Maarten', 39689],
 [214, u'Liechtenstein', 37009],
 [215, u'San Marino', 32448],
 [216, u'British Virgin Islands', 31912],
 [217, u'Saint Martin', 31264],
 [218, u'Monaco', 30500],
 [219, u'Gibraltar', 29111],
 [220, u'Palau', 21108],
 [221, u'Anguilla', 15754],
 [222, u'Dhekelia', 15700],
 [223, u'Akrotiri', 15700],
 [224, u'Wallis and Futuna', 15507],
 [225, u'Tuvalu', 10698],
 [226, u'Cook Islands', 10447],
 [227, u'Nauru', 9434],
 [228, u'Saint Helena, Ascension, and Tristan da Cunha', 7754],
 [229, u'Saint Barthelemy', 7298],
 [230, u'Saint Pierre and Miquelon', 5774],
 [231, u'Montserrat', 5189],
 [232, u'Falkland Islands (Islas Malvinas)', 3140],
 [233, u'Norfolk Island', 2196],
 [234, u'Svalbard', 1921],
 [235, u'Christmas Island', 1513],
 [236, u'Tokelau', 1353],
 [237, u'Niue', 1229],
 [238, u'Holy See (Vatican City)', 839],
 [239, u'Cocos (Keeling) Islands', 596],
 [240, u'Pitcairn Islands', 65]]

In [43]:
cia_world_pop = sum([r[2] for r in cia_list if r[1] != 'European Union'])
cia_world_pop


Out[43]:
7091218583L

In [44]:
cia_world_pop, world_pop, cia_world_pop/float(world_pop)


Out[44]:
(7091218583L, 7162119434L, 0.9901005768399478)

Comparing two lists


In [46]:
# set of entities for Wikipedia
wk_entities = set([r[1] for r in pop_list])
wk_entities


Out[46]:
{u'Afghanistan',
 u'Albania',
 u'Algeria',
 u'American Samoa',
 u'Andorra',
 u'Angola',
 u'Anguilla',
 u'Antigua and Barbuda',
 u'Argentina',
 u'Armenia',
 u'Aruba',
 u'Australia',
 u'Austria',
 u'Azerbaijan',
 u'Bahamas',
 u'Bahrain',
 u'Bangladesh',
 u'Barbados',
 u'Belarus',
 u'Belgium',
 u'Belize',
 u'Benin',
 u'Bermuda',
 u'Bhutan',
 u'Bolivia',
 u'Bosnia and Herzegovina',
 u'Botswana',
 u'Brazil',
 u'Brunei',
 u'Bulgaria',
 u'Burkina Faso',
 u'Burundi',
 u'Cambodia',
 u'Cameroon',
 u'Canada',
 u'Cape Verde',
 u'Caribbean Netherlands',
 u'Cayman Islands',
 u'Central African Republic',
 u'Chad',
 u'Chile',
 u'China',
 u'Colombia',
 u'Comoros',
 u'Congo, Democratic Republic of the',
 u'Congo, Republic of the',
 u'Cook Islands',
 u'Costa Rica',
 u'Croatia',
 u'Cuba',
 u'Cura\xe7ao',
 u'Cyprus',
 u'Czech Republic',
 u"C\xf4te d'Ivoire",
 u'Denmark',
 u'Djibouti',
 u'Dominica',
 u'Dominican Republic',
 u'Ecuador',
 u'Egypt',
 u'El Salvador',
 u'Equatorial Guinea',
 u'Eritrea',
 u'Estonia',
 u'Ethiopia',
 u'Falkland Islands',
 u'Faroe Islands',
 u'Fiji',
 u'Finland',
 u'France',
 u'French Guiana',
 u'French Polynesia',
 u'Gabon',
 u'Gambia',
 u'Georgia',
 u'Germany',
 u'Ghana',
 u'Gibraltar',
 u'Greece',
 u'Greenland',
 u'Grenada',
 u'Guadeloupe',
 u'Guam',
 u'Guatemala',
 u'Guernsey; Jersey',
 u'Guinea',
 u'Guinea-Bissau',
 u'Guyana',
 u'Haiti',
 u'Honduras',
 u'Hong Kong',
 u'Hungary',
 u'Iceland',
 u'India',
 u'Indonesia',
 u'Iran',
 u'Iraq',
 u'Ireland',
 u'Isle of Man',
 u'Israel',
 u'Italy',
 u'Jamaica',
 u'Japan',
 u'Jordan',
 u'Kazakhstan',
 u'Kenya',
 u'Kiribati',
 u'Korea, North',
 u'Korea, South',
 u'Kuwait',
 u'Kyrgyzstan',
 u'Laos',
 u'Latvia',
 u'Lebanon',
 u'Lesotho',
 u'Liberia',
 u'Libya',
 u'Liechtenstein',
 u'Lithuania',
 u'Luxembourg',
 u'Macau',
 u'Macedonia',
 u'Madagascar',
 u'Malawi',
 u'Malaysia',
 u'Maldives',
 u'Mali',
 u'Malta',
 u'Marshall Islands',
 u'Martinique',
 u'Mauritania',
 u'Mauritius',
 u'Mayotte',
 u'Mexico',
 u'Micronesia, Federated States of',
 u'Moldova',
 u'Monaco',
 u'Mongolia',
 u'Montenegro',
 u'Montserrat',
 u'Morocco',
 u'Mozambique',
 u'Myanmar',
 u'Namibia',
 u'Nauru',
 u'Nepal',
 u'Netherlands',
 u'New Caledonia',
 u'New Zealand',
 u'Nicaragua',
 u'Niger',
 u'Nigeria',
 u'Niue',
 u'Northern Mariana Islands',
 u'Norway',
 u'Oman',
 u'Pakistan',
 u'Palau',
 u'Palestine',
 u'Panama',
 u'Papua New Guinea',
 u'Paraguay',
 u'Peru',
 u'Philippines',
 u'Poland',
 u'Portugal',
 u'Puerto Rico',
 u'Qatar',
 u'Romania',
 u'Russia',
 u'Rwanda',
 u'R\xe9union',
 u'Saint Helena, Ascension and Tristan da Cunha',
 u'Saint Kitts and Nevis',
 u'Saint Lucia',
 u'Saint Pierre and Miquelon',
 u'Saint Vincent and the Grenadines',
 u'Samoa',
 u'San Marino',
 u'Saudi Arabia',
 u'Senegal',
 u'Serbia; Kosovo',
 u'Seychelles',
 u'Sierra Leone',
 u'Singapore',
 u'Sint Maarten',
 u'Slovakia',
 u'Slovenia',
 u'Solomon Islands',
 u'Somalia',
 u'South Africa',
 u'South Sudan',
 u'Spain',
 u'Sri Lanka',
 u'Sudan',
 u'Suriname',
 u'Swaziland',
 u'Sweden',
 u'Switzerland',
 u'Syria',
 u'S\xe3o Tom\xe9 and Pr\xedncipe',
 u'Taiwan',
 u'Tajikistan',
 u'Tanzania',
 u'Thailand',
 u'Timor-Leste',
 u'Togo',
 u'Tokelau',
 u'Tonga',
 u'Trinidad and Tobago',
 u'Tunisia',
 u'Turkey',
 u'Turkmenistan',
 u'Turks and Caicos Islands',
 u'Tuvalu',
 u'Uganda',
 u'Ukraine',
 u'United Arab Emirates',
 u'United Kingdom',
 u'United States',
 u'Uruguay',
 u'Uzbekistan',
 u'Vanuatu',
 u'Vatican City',
 u'Venezuela',
 u'Vietnam',
 u'Virgin Islands, British',
 u'Virgin Islands, United States',
 u'Wallis and Futuna',
 u'Western Sahara',
 u'Yemen',
 u'Zambia',
 u'Zimbabwe'}

In [47]:
cia_entities = set([r[1] for r in cia_list])

In [48]:
len(wk_entities), len(cia_entities)


Out[48]:
(233, 240)

In [49]:
# http://docs.python.org/2/library/stdtypes.html#set
# intersection
len(wk_entities & cia_entities)


Out[49]:
212

In [37]:
# symmetric diff
wk_entities ^ cia_entities


Out[37]:
{u'Akrotiri',
 u'Bahamas',
 u'Bahamas, The',
 u'British Virgin Islands',
 u'Burma',
 u'Cabo Verde',
 u'Cape Verde',
 u'Caribbean Netherlands',
 u'Christmas Island',
 u'Cocos (Keeling) Islands',
 u"Cote d'Ivoire",
 u'Curacao',
 u'Cura\xe7ao',
 u"C\xf4te d'Ivoire",
 u'Dhekelia',
 u'European Union',
 u'Falkland Islands',
 u'Falkland Islands (Islas Malvinas)',
 u'French Guiana',
 u'Gambia',
 u'Gambia, The',
 u'Gaza Strip',
 u'Guadeloupe',
 u'Guernsey',
 u'Guernsey; Jersey',
 u'Holy See (Vatican City)',
 u'Jersey',
 u'Kosovo',
 u'Martinique',
 u'Mayotte',
 u'Myanmar',
 u'Norfolk Island',
 u'Palestine',
 u'Pitcairn Islands',
 u'R\xe9union',
 u'Saint Barthelemy',
 u'Saint Helena, Ascension and Tristan da Cunha',
 u'Saint Helena, Ascension, and Tristan da Cunha',
 u'Saint Martin',
 u'Sao Tome and Principe',
 u'Serbia',
 u'Serbia; Kosovo',
 u'Svalbard',
 u'S\xe3o Tom\xe9 and Pr\xedncipe',
 u'Vatican City',
 u'Virgin Islands',
 u'Virgin Islands, British',
 u'Virgin Islands, United States',
 u'West Bank'}

In [38]:
wk_entities - cia_entities


Out[38]:
{u'Bahamas',
 u'Cape Verde',
 u'Caribbean Netherlands',
 u'Cura\xe7ao',
 u"C\xf4te d'Ivoire",
 u'Falkland Islands',
 u'French Guiana',
 u'Gambia',
 u'Guadeloupe',
 u'Guernsey; Jersey',
 u'Martinique',
 u'Mayotte',
 u'Myanmar',
 u'Palestine',
 u'R\xe9union',
 u'Saint Helena, Ascension and Tristan da Cunha',
 u'Serbia; Kosovo',
 u'S\xe3o Tom\xe9 and Pr\xedncipe',
 u'Vatican City',
 u'Virgin Islands, British',
 u'Virgin Islands, United States'}

In [39]:
cia_entities - wk_entities


Out[39]:
{u'Akrotiri',
 u'Bahamas, The',
 u'British Virgin Islands',
 u'Burma',
 u'Cabo Verde',
 u'Christmas Island',
 u'Cocos (Keeling) Islands',
 u"Cote d'Ivoire",
 u'Curacao',
 u'Dhekelia',
 u'European Union',
 u'Falkland Islands (Islas Malvinas)',
 u'Gambia, The',
 u'Gaza Strip',
 u'Guernsey',
 u'Holy See (Vatican City)',
 u'Jersey',
 u'Kosovo',
 u'Norfolk Island',
 u'Pitcairn Islands',
 u'Saint Barthelemy',
 u'Saint Helena, Ascension, and Tristan da Cunha',
 u'Saint Martin',
 u'Sao Tome and Principe',
 u'Serbia',
 u'Svalbard',
 u'Virgin Islands',
 u'West Bank'}

In [34]:
len(wk_entities or cia_entities)


Out[34]:
233