HCDS_(Fall_2017)

A2 Assignments

A2: Bias in data

by Abhishek Anand

The goal for project is to explore the concept of 'bias' through data on Wikipedia articles .

Step 1 : Getting the article and population data

Source :

* Wikipedia Data Set : Politicians by Country from the English-language Wikipedia  [https://figshare.com/articles/Untitled_Item/5513449] : File Name  - page_data.csv  
    1. "country", containing the sanitised country name, extracted from the category name;
    2. "page", containing the unsanitised page title.
    3. "last_edit", containing the edit ID of the last edit to the page.
* Population Data : http://www.prb.org/DataFinder/Topic/Rankings.aspx?ind=14   File Name  - opulation Mid-2015.csv
* Article quality predictions : https://ores.wikimedia.org/v3/#!/scoring/get_v3_scores_context_revid_model 

In [2]:
## getting the data from the CSV files
import csv

In [3]:
# Data Source 1 : 
# reading data and saving in dictionary of dictionaries
'''
{
    "revision_id": {
        "country": "",
        "article_name": "",
        "revision_id":"",
    },
    "revision_id": {
    
    },
    ...
}
'''

page_data = dict()
skip_lines = 1

with open('page_data.csv', encoding="utf8") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        #page_data.append([row[0],row[1],row[2]])
        #['Template:ZambiaProvincialMinisters', 'Zambia', '235107991']
        if(skip_lines!=1):
            revision_id = row[2]
            page_data[revision_id]=dict()
            page_data[revision_id]["country"] = row[1]
            page_data[revision_id]["article_name"] = row[0]
            page_data[revision_id]["revision_id"] = row[2] 
            
        skip_lines = skip_lines+1
# print(page_data)

In [4]:
# Data Source 2 : 
'''
{
    "country_name_1":"population",
    "country_name_1":"population"
}

'''
population_data = {}

# skip twolines from input csv file 
skip_lines = 1
with open('Population Mid-2015.csv', encoding="utf8") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        # last line in the raw data file is an empty line
        # skip_lines > 2 skips first two lines
        # row checks if the read list is empty
        #print(row)
        if(skip_lines>3 and row):
            # ['Afghanistan', 'Country', 'Mid-2015', 'Number', '32,247,000', '']
            population_data[row[0]] = row[4]
        skip_lines = skip_lines + 1
        
print(population_data)


{'Afghanistan': '32,247,000', 'Albania': '2,892,000', 'Algeria': '39,948,000', 'Andorra': '78,000', 'Angola': '25,000,000', 'Antigua and Barbuda': '90,000', 'Argentina': '42,426,000', 'Armenia': '3,017,106', 'Australia': '23,888,000', 'Austria': '8,615,955', 'Azerbaijan': '9,651,000', 'Bahamas': '377,000', 'Bahrain': '1,412,299', 'Bangladesh': '160,411,000', 'Barbados': '278,000', 'Belarus': '9,524,247', 'Belgium': '11,211,064', 'Belize': '368,000', 'Benin': '10,583,034', 'Bhutan': '757,000', 'Bolivia': '10,475,500', 'Bosnia-Herzegovina': '3,650,114', 'Botswana': '2,139,900', 'Brazil': '204,519,398', 'Brunei': '413,000', 'Bulgaria': '7,181,000', 'Burkina Faso': '18,450,400', 'Burundi': '10,742,000', 'Cambodia': '15,417,100', 'Cameroon': '23,739,000', 'Canada': '35,833,000', 'Cape Verde': '514,000', 'Central African Republic': '5,551,900', 'Chad': '13,707,000', 'Channel Islands': '164,000', 'Chile': '18,025,000', 'China': '1,371,920,000', 'Colombia': '48,218,000', 'Comoros': '764,000', 'Congo': '4,755,000', 'Congo, Dem. Rep. of': '73,340,200', 'Costa Rica': '4,832,000', "Cote d'Ivoire": '23,281,300', 'Croatia': '4,215,000', 'Cuba': '11,139,000', 'Curacao': '158,000', 'Cyprus': '1,153,000', 'Czech Republic': '10,551,227', 'Denmark': '5,676,025', 'Djibouti': '900,000', 'Dominica': '68,000', 'Dominican Republic': '10,508,000', 'Ecuador': '16,279,000', 'Egypt': '89,074,000', 'El Salvador': '6,366,000', 'Equatorial Guinea': '805,000', 'Eritrea': '5,200,000', 'Estonia': '1,310,504', 'Ethiopia': '98,148,000', 'Federated States of Micronesia': '103,000', 'Fiji': '867,000', 'Finland': '5,476,031', 'France': '64,346,720', 'French Guiana': '251,000', 'French Polynesia': '263,000', 'Gabon': '1,751,000', 'Gambia': '2,021,893', 'Georgia': '3,804,000', 'Germany': '81,132,000', 'Ghana': '27,672,800', 'Greece': '11,520,785', 'Grenada': '111,000', 'Guadeloupe': '407,000', 'Guam': '184,200', 'Guatemala': '16,183,752', 'Guinea': '10,985,600', 'Guinea-Bissau': '1,788,000', 'Guyana': '743,000', 'Haiti': '10,924,000', 'Honduras': '8,340,000', 'Hong Kong, SAR': '7,286,402', 'Hungary': '9,835,030', 'Iceland': '330,828', 'India': '1,314,097,616', 'Indonesia': '255,741,973', 'Iran': '78,483,446', 'Iraq': '37,056,000', 'Ireland': '4,630,308', 'Israel': '8,375,384', 'Italy': '62,466,780', 'Jamaica': '2,727,000', 'Japan': '126,866,820', 'Jordan': '8,118,000', 'Kazakhstan': '17,544,274', 'Kenya': '44,306,000', 'Kiribati': '113,400', 'Korea, North': '24,983,000', 'Korea, South': '50,713,867', 'Kosovo': '1,802,000', 'Kuwait': '3,837,700', 'Kyrgyzstan': '5,951,000', 'Laos': '6,903,049', 'Latvia': '1,978,454', 'Lebanon': '6,185,000', 'Lesotho': '1,924,381', 'Liberia': '4,503,000', 'Libya': '6,317,000', 'Liechtenstein': '37,570', 'Lithuania': '2,911,203', 'Luxembourg': '569,202', 'Macao, SAR': '658,611', 'Macedonia': '2,070,100', 'Madagascar': '23,047,400', 'Malawi': '17,174,000', 'Malaysia': '30,788,840', 'Maldives': '346,946', 'Mali': '16,749,000', 'Malta': '431,486', 'Marshall Islands': '55,000', 'Martinique': '379,000', 'Mauritania': '3,641,288', 'Mauritius': '1,262,660', 'Mayotte': '229,890', 'Mexico': '127,017,000', 'Moldova': '4,109,000', 'Monaco': '38,088', 'Mongolia': '3,029,335', 'Montenegro': '622,421', 'Morocco': '34,121,000', 'Mozambique': '25,736,000', 'Myanmar': '52,147,000', 'Namibia': '2,482,100', 'Nauru': '10,860', 'Nepal': '28,039,000', 'Netherlands': '16,942,373', 'New Caledonia': '271,974', 'New Zealand': '4,598,066', 'Nicaragua': '6,262,000', 'Niger': '18,884,462', 'Nigeria': '181,839,400', 'Norway': '5,194,411', 'Oman': '4,201,000', 'Pakistan': '199,047,300', 'Palau': '18,000', 'Palestinian Territory': '4,481,195', 'Panama': '3,980,000', 'Papua New Guinea': '7,744,600', 'Paraguay': '7,020,000', 'Peru': '31,151,643', 'Philippines': '102,965,300', 'Poland': '38,478,001', 'Portugal': '10,349,000', 'Puerto Rico': '3,502,000', 'Qatar': '2,394,524', 'Reunion': '851,060', 'Romania': '19,838,662', 'Russia': '144,302,000', 'Rwanda': '11,331,300', 'Samoa': '194,210', 'San Marino': '33,000', 'Sao Tome and Principe': '195,570', 'Saudi Arabia': '31,565,109', 'Senegal': '14,690,400', 'Serbia': '7,097,190', 'Seychelles': '92,833', 'Sierra Leone': '6,502,960', 'Singapore': '5,541,121', 'Slovakia': '5,424,051', 'Slovenia': '2,064,000', 'Solomon Islands': '641,900', 'Somalia': '11,123,000', 'South Africa': '55,041,000', 'South Sudan': '12,152,000', 'Spain': '46,368,000', 'Sri Lanka': '20,868,800', 'St. Kitts-Nevis': '46,000', 'St. Lucia': '175,000', 'St. Vincent & the Grenadines': '110,000', 'Sudan': '40,883,900', 'Suriname': '576,000', 'Swaziland': '1,286,000', 'Sweden': '9,804,792', 'Switzerland': '8,292,851', 'Syria': '17,065,000', 'Taiwan': '23,468,000', 'Tajikistan': '8,452,153', 'Tanzania': '52,291,000', 'Thailand': '65,121,250', 'Timor-Leste': '1,244,759', 'Togo': '7,231,000', 'Tonga': '103,300', 'Trinidad and Tobago': '1,351,000', 'Tunisia': '11,026,000', 'Turkey': '78,215,000', 'Turkmenistan': '5,373,000', 'Tuvalu': '11,800', 'Uganda': '40,141,000', 'Ukraine': '42,828,300', 'United Arab Emirates': '9,577,000', 'United Kingdom': '65,092,000', 'United States': '321,234,172', 'Uruguay': '3,562,000', 'Uzbekistan': '31,290,791', 'Vanuatu': '277,500', 'Venezuela': '30,620,000', 'Vietnam': '91,714,080', 'Western Sahara': '604,000', 'Yemen': '26,737,000', 'Zambia': '15,473,900', 'Zimbabwe': '17,354,000'}

In [5]:
# Merging data set from wikipedia data(page_data) and population data(population_data)
# reading data from page_data and including poulation data
'''
{
    "revision_id": {
        "country": "",
        "article_name": "",
        "revision_id":"",
        "population":""
    },
    "revision_id": {
    
    },
    ...
}
'''
count=0
for key,value in page_data.items():
    revision_id = key
    #print(key)
    
    country_page_data = value["country"]
    #print(country_page_data)
    
    if population_data.get(str(country_page_data)) is not None:
        page_data[revision_id]["population"] = population_data[country_page_data]
    else:
        # for now setting population as 0 for countries which are not preset in population_data
        page_data[revision_id]["population"] = 0
        count = count+1
        
#print("no of entries which have popluation 0")      
#print(count)

Getting article quality predictions


In [6]:
# Data Source 3 : 
import requests
import json

headers = {'User-Agent' : 'https://github.com/abhishekanand', 'From' : 'anand1@uw.edu'}

def get_ores_data(revision_ids, headers):
    
    # Define the endpoint
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'
    
    # Specify the parameters - smushing all the revision IDs together separated by | marks.
    # Yes, 'smush' is a technical term, trust me I'm a scientist.
    # What do you mean "but people trusting scientists regularly goes horribly wrong" who taught you tha- oh.  
    params = {'project' : 'enwiki',
              'model'   : 'wp10',
              'revids'  : '|'.join(str(x) for x in revision_ids)
              }
    api_call = requests.get(endpoint.format(**params))
    response = api_call.json()
    json.dumps(response, indent=4, sort_keys=True)
    #print(response)
    return response


# So if we grab some example revision IDs and turn them into a list and then call get_ores_data...
#example_ids = [783381498, 807355596, 757539710]
#get_ores_data(example_ids, headers)


#get_ores_data(example_ids, headers)
no_revision_ids = len(page_data)
#print(no_revision_ids)

# getting all the revision ids from page_data in list format
revision_ids = list(page_data.keys())


counter = 0
call_count = 0

# this contains revsion id as key and article quality as value
data_revision_quality  = {}



# prediction is being added in page_data
'''
{
    "revision_id": {
        "country": "",
        "article_name": "",
        "revision_id":"",
        "population":"",
        "article_quality":""
    },
    "revision_id": {
    
    },
    ...
}
'''

while(counter<no_revision_ids):
    temp = get_ores_data(revision_ids[counter:counter+100], headers)
    hundred_revisions = temp["enwiki"]["scores"]
    
    for key, value in hundred_revisions.items():
        revision = key
        if value["wp10"].get("score") is not None:
            prediction = value["wp10"]["score"]["prediction"]
            page_data[revision]["article_quality"] = prediction
            #print(page_data[revision])
        else:
            page_data[revision]["article_quality"] = 'NA'
        
        
    counter = counter + 100
    call_count = call_count + 1
    
    
if(counter>no_revision_ids):
    temp = get_ores_data(revision_ids[counter-100:(counter-100)+no_revision_ids%100], headers)
    left_revisions = temp["enwiki"]["scores"]
    
    counter = counter-100
    call_count = call_count + 1
    
    for key, value in left_revisions.items():
        revision = key
        counter = counter + 1
        if value["wp10"].get("score") is not None:
            prediction = value["wp10"]["score"]["prediction"]
            page_data[revision]["article_quality"] = prediction
            #print(page_data[revision])

In [7]:
#print(counter)  # Numbe of entries received from the API Call 
#print(call_count)
#print(no_revision_ids%100)

In [8]:
# Cleaning Data Set to contain following values only 
    #country
    #article_name
    #revision_id
    #article_quality
    #population

    
combined_data = [] # Empty List 
for key, value in page_data.items():
    combined_data.append(value)
    
'''
printing first 100 revision ids because of 
"OPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`."
'''
for i in range(100):    
    print(combined_data[i])


{'country': 'Zambia', 'article_name': 'Template:ZambiaProvincialMinisters', 'revision_id': '235107991', 'population': '15,473,900', 'article_quality': 'Stub'}
{'country': 'Chad', 'article_name': 'Bir I of Kanem', 'revision_id': '355319463', 'population': '13,707,000', 'article_quality': 'Stub'}
{'country': 'Zimbabwe', 'article_name': 'Template:Zimbabwe-politician-stub', 'revision_id': '391862046', 'population': '17,354,000', 'article_quality': 'Stub'}
{'country': 'Uganda', 'article_name': 'Template:Uganda-politician-stub', 'revision_id': '391862070', 'population': '40,141,000', 'article_quality': 'Stub'}
{'country': 'Namibia', 'article_name': 'Template:Namibia-politician-stub', 'revision_id': '391862409', 'population': '2,482,100', 'article_quality': 'Stub'}
{'country': 'Nigeria', 'article_name': 'Template:Nigeria-politician-stub', 'revision_id': '391862819', 'population': '181,839,400', 'article_quality': 'Stub'}
{'country': 'Colombia', 'article_name': 'Template:Colombia-politician-stub', 'revision_id': '391863340', 'population': '48,218,000', 'article_quality': 'Stub'}
{'country': 'Chile', 'article_name': 'Template:Chile-politician-stub', 'revision_id': '391863361', 'population': '18,025,000', 'article_quality': 'Stub'}
{'country': 'Fiji', 'article_name': 'Template:Fiji-politician-stub', 'revision_id': '391863617', 'population': '867,000', 'article_quality': 'Stub'}
{'country': 'Solomon Islands', 'article_name': 'Template:Solomons-politician-stub', 'revision_id': '391863809', 'population': '641,900', 'article_quality': 'Stub'}
{'country': 'Palestinian Territory', 'article_name': 'Information Minister of the Palestinian National Authority', 'revision_id': '393276188', 'population': '4,481,195', 'article_quality': 'Stub'}
{'country': 'Somalia', 'article_name': 'Template:Somalia-politician-stub', 'revision_id': '393298432', 'population': '11,123,000', 'article_quality': 'Stub'}
{'country': 'Cambodia', 'article_name': 'Yos Por', 'revision_id': '393822005', 'population': '15,417,100', 'article_quality': 'Stub'}
{'country': 'Slovakia', 'article_name': 'Template:Slovakia-politician-stub', 'revision_id': '394482629', 'population': '5,424,051', 'article_quality': 'Stub'}
{'country': 'Slovenia', 'article_name': 'Template:Slovenia-politician-stub', 'revision_id': '394482891', 'population': '2,064,000', 'article_quality': 'Stub'}
{'country': 'Afghanistan', 'article_name': 'Template:Afghanistan-politician-stub', 'revision_id': '394580295', 'population': '32,247,000', 'article_quality': 'Stub'}
{'country': 'Iraq', 'article_name': 'Template:Iraq-politician-stub', 'revision_id': '394580630', 'population': '37,056,000', 'article_quality': 'Stub'}
{'country': 'Nepal', 'article_name': 'Template:Nepal-politician-stub', 'revision_id': '394580939', 'population': '28,039,000', 'article_quality': 'Stub'}
{'country': 'Sri Lanka', 'article_name': 'Template:SriLanka-politician-stub', 'revision_id': '394580993', 'population': '20,868,800', 'article_quality': 'Stub'}
{'country': 'Laos', 'article_name': 'Template:Laos-politician-stub', 'revision_id': '394581284', 'population': '6,903,049', 'article_quality': 'Stub'}
{'country': 'Albania', 'article_name': 'Template:Albania-politician-stub', 'revision_id': '394581557', 'population': '2,892,000', 'article_quality': 'Stub'}
{'country': 'Costa Rica', 'article_name': 'Template:CostaRica-politician-stub', 'revision_id': '394587483', 'population': '4,832,000', 'article_quality': 'Stub'}
{'country': 'Hondura', 'article_name': 'Template:Honduras-politician-stub', 'revision_id': '394587547', 'population': 0, 'article_quality': 'Stub'}
{'country': 'Czech Republic', 'article_name': 'Julius Gregr', 'revision_id': '395521877', 'population': '10,551,227', 'article_quality': 'Stub'}
{'country': 'Czech Republic', 'article_name': 'Edvard Gregr', 'revision_id': '395526568', 'population': '10,551,227', 'article_quality': 'Stub'}
{'country': 'Canada', 'article_name': 'Robert Douglas Cook', 'revision_id': '401577829', 'population': '35,833,000', 'article_quality': 'Stub'}
{'country': 'Tunisia', 'article_name': 'Template:Tunisia-politician-stub', 'revision_id': '413885084', 'population': '11,026,000', 'article_quality': 'Stub'}
{'country': 'Guatemala', 'article_name': 'Template:Guatemala-politician-stub', 'revision_id': '433871129', 'population': '16,183,752', 'article_quality': 'Stub'}
{'country': 'Burkina Faso', 'article_name': 'Template:BurkinaFaso-politician-stub', 'revision_id': '433871165', 'population': '18,450,400', 'article_quality': 'Stub'}
{'country': 'Angola', 'article_name': 'Template:Angola-politician-stub', 'revision_id': '435008715', 'population': '25,000,000', 'article_quality': 'Stub'}
{'country': 'Panama', 'article_name': 'Template:Panama-politician-stub', 'revision_id': '437454659', 'population': '3,980,000', 'article_quality': 'Stub'}
{'country': 'Japan', 'article_name': 'Template:Japan-politician-1980s-stub', 'revision_id': '437735138', 'population': '126,866,820', 'article_quality': 'Stub'}
{'country': 'Indonesia', 'article_name': 'Template:Indonesia-politician-stub', 'revision_id': '438305657', 'population': '255,741,973', 'article_quality': 'Stub'}
{'country': 'Madagascar', 'article_name': 'Template:Madagascar-politician-stub', 'revision_id': '439671509', 'population': '23,047,400', 'article_quality': 'Stub'}
{'country': 'Malaysia', 'article_name': 'Template:Malaysia-politician-stub', 'revision_id': '439708117', 'population': '30,788,840', 'article_quality': 'Stub'}
{'country': 'Gabon', 'article_name': 'Template:Gabon-politician-stub', 'revision_id': '440397578', 'population': '1,751,000', 'article_quality': 'Stub'}
{'country': 'Germany', 'article_name': 'Template:Germany-DDP-politician-stub', 'revision_id': '440594068', 'population': '81,132,000', 'article_quality': 'Stub'}
{'country': 'Germany', 'article_name': 'Template:Germany-Centre-politician-stub', 'revision_id': '440598656', 'population': '81,132,000', 'article_quality': 'Stub'}
{'country': 'Liberia', 'article_name': 'Template:Liberia-politician-stub', 'revision_id': '441172886', 'population': '4,503,000', 'article_quality': 'Stub'}
{'country': 'Ghana', 'article_name': 'Template:Ghana-politician-stub', 'revision_id': '441186581', 'population': '27,672,800', 'article_quality': 'Stub'}
{'country': 'Peru', 'article_name': 'Template:Peru-politician-stub', 'revision_id': '441771813', 'population': '31,151,643', 'article_quality': 'Stub'}
{'country': 'Argentina', 'article_name': 'Template:Argentina-politician-stub', 'revision_id': '441995465', 'population': '42,426,000', 'article_quality': 'Stub'}
{'country': 'Spain', 'article_name': 'Template:Catalonia-politician-stub', 'revision_id': '442411422', 'population': '46,368,000', 'article_quality': 'Stub'}
{'country': 'South Africa', 'article_name': 'Template:SouthAfrica-politician-stub', 'revision_id': '442913438', 'population': '55,041,000', 'article_quality': 'Stub'}
{'country': 'Egypt', 'article_name': 'List of Grand Viziers of Egypt', 'revision_id': '442937236', 'population': '89,074,000', 'article_quality': 'Stub'}
{'country': 'Costa Rica', 'article_name': 'Template:CostaRica-mayor-stub', 'revision_id': '443468553', 'population': '4,832,000', 'article_quality': 'Stub'}
{'country': 'Hondura', 'article_name': 'Template:Honduras-mayor-stub', 'revision_id': '443469862', 'population': 0, 'article_quality': 'Stub'}
{'country': 'Nicaragua', 'article_name': 'Template:Nicaragua-mayor-stub', 'revision_id': '443470532', 'population': '6,262,000', 'article_quality': 'Stub'}
{'country': 'Afghanistan', 'article_name': 'Template:Afghanistan-mayor-stub', 'revision_id': '443496992', 'population': '32,247,000', 'article_quality': 'Stub'}
{'country': 'Bangladesh', 'article_name': 'Template:Bangladesh-mayor-stub', 'revision_id': '443497423', 'population': '160,411,000', 'article_quality': 'Stub'}
{'country': 'Cambodia', 'article_name': 'Template:Cambodia-mayor-stub', 'revision_id': '443497605', 'population': '15,417,100', 'article_quality': 'Stub'}
{'country': 'India', 'article_name': 'Template:India-mayor-stub', 'revision_id': '443498496', 'population': '1,314,097,616', 'article_quality': 'Stub'}
{'country': 'Iran', 'article_name': 'Template:Iran-mayor-stub', 'revision_id': '443499145', 'population': '78,483,446', 'article_quality': 'Stub'}
{'country': 'Iraq', 'article_name': 'Template:Iraq-mayor-stub', 'revision_id': '443499367', 'population': '37,056,000', 'article_quality': 'Stub'}
{'country': 'Argentina', 'article_name': 'Template:Argentina-mayor-stub', 'revision_id': '443521996', 'population': '42,426,000', 'article_quality': 'Stub'}
{'country': 'Philippines', 'article_name': 'Template:Philippines-mayor-stub', 'revision_id': '443628777', 'population': '102,965,300', 'article_quality': 'Stub'}
{'country': 'Sri Lanka', 'article_name': 'Template:SriLanka-mayor-stub', 'revision_id': '443632023', 'population': '20,868,800', 'article_quality': 'Stub'}
{'country': 'Turkey', 'article_name': 'Template:Turkey-mayor-stub', 'revision_id': '443633073', 'population': '78,215,000', 'article_quality': 'Stub'}
{'country': 'Albania', 'article_name': 'Template:Albania-mayor-stub', 'revision_id': '443634517', 'population': '2,892,000', 'article_quality': 'Stub'}
{'country': 'Austria', 'article_name': 'Template:Austria-mayor-stub', 'revision_id': '443640939', 'population': '8,615,955', 'article_quality': 'Stub'}
{'country': 'Azerbaijan', 'article_name': 'Template:Azerbaijan-mayor-stub', 'revision_id': '443641865', 'population': '9,651,000', 'article_quality': 'Stub'}
{'country': 'Haiti', 'article_name': 'Template:Haiti-politician-stub', 'revision_id': '444002354', 'population': '10,924,000', 'article_quality': 'Stub'}
{'country': 'Greece', 'article_name': 'Template:Greece-mayor-stub', 'revision_id': '444023126', 'population': '11,520,785', 'article_quality': 'Stub'}
{'country': 'Hungary', 'article_name': 'Template:Hungary-mayor-stub', 'revision_id': '444023605', 'population': '9,835,030', 'article_quality': 'Stub'}
{'country': 'Iceland', 'article_name': 'Template:Iceland-mayor-stub', 'revision_id': '444023818', 'population': '330,828', 'article_quality': 'Stub'}
{'country': 'Moldova', 'article_name': 'Template:Moldova-mayor-stub', 'revision_id': '444031013', 'population': '4,109,000', 'article_quality': 'Stub'}
{'country': 'Romania', 'article_name': 'Template:Romania-mayor-stub', 'revision_id': '444036980', 'population': '19,838,662', 'article_quality': 'Stub'}
{'country': 'Czech Republic', 'article_name': 'Template:CzechRepublic-mayor-stub', 'revision_id': '444371643', 'population': '10,551,227', 'article_quality': 'Stub'}
{'country': 'Poland', 'article_name': 'Template:Poland-mayor-stub', 'revision_id': '444373997', 'population': '38,478,001', 'article_quality': 'Stub'}
{'country': 'Luxembourg', 'article_name': 'Template:Luxembourg-politician-stub', 'revision_id': '444377000', 'population': '569,202', 'article_quality': 'Stub'}
{'country': 'Luxembourg', 'article_name': 'Template:Luxembourg-mayor-stub', 'revision_id': '444381930', 'population': '569,202', 'article_quality': 'Stub'}
{'country': 'Denmark', 'article_name': 'Template:Denmark-mayor-stub', 'revision_id': '444431830', 'population': '5,676,025', 'article_quality': 'Stub'}
{'country': 'Namibia', 'article_name': 'Template:Namibia-mayor-stub', 'revision_id': '444756008', 'population': '2,482,100', 'article_quality': 'Stub'}
{'country': 'South Africa', 'article_name': 'Template:SouthAfrica-mayor-stub', 'revision_id': '444756369', 'population': '55,041,000', 'article_quality': 'Stub'}
{'country': 'Burkina Faso', 'article_name': 'Template:BurkinaFaso-mayor-stub', 'revision_id': '444759010', 'population': '18,450,400', 'article_quality': 'Stub'}
{'country': 'Gabon', 'article_name': 'Template:Gabon-mayor-stub', 'revision_id': '444762949', 'population': '1,751,000', 'article_quality': 'Stub'}
{'country': 'Kenya', 'article_name': 'Template:Kenya-mayor-stub', 'revision_id': '444766243', 'population': '44,306,000', 'article_quality': 'Stub'}
{'country': 'Liberia', 'article_name': 'Template:Liberia-mayor-stub', 'revision_id': '444766443', 'population': '4,503,000', 'article_quality': 'Stub'}
{'country': 'Somalia', 'article_name': 'Template:Somalia-mayor-stub', 'revision_id': '444767261', 'population': '11,123,000', 'article_quality': 'Stub'}
{'country': 'Tunisia', 'article_name': 'Template:Tunisia-mayor-stub', 'revision_id': '444767517', 'population': '11,026,000', 'article_quality': 'Stub'}
{'country': 'Uganda', 'article_name': 'Template:Uganda-mayor-stub', 'revision_id': '444767727', 'population': '40,141,000', 'article_quality': 'Stub'}
{'country': 'Zimbabwe', 'article_name': 'Template:Zimbabwe-mayor-stub', 'revision_id': '444767985', 'population': '17,354,000', 'article_quality': 'Stub'}
{'country': 'Ecuador', 'article_name': 'Template:Ecuador-mayor-stub', 'revision_id': '445199467', 'population': '16,279,000', 'article_quality': 'Stub'}
{'country': 'Ecuador', 'article_name': 'Template:Ecuador-politician-stub', 'revision_id': '445225417', 'population': '16,279,000', 'article_quality': 'Stub'}
{'country': 'Cambodia', 'article_name': 'Template:Cambodia-politician-stub', 'revision_id': '445365862', 'population': '15,417,100', 'article_quality': 'Stub'}
{'country': 'Salvadoran', 'article_name': 'Template:ElSalvador-politician-stub', 'revision_id': '445403015', 'population': 0, 'article_quality': 'Stub'}
{'country': 'Finland', 'article_name': 'Template:Finland-mayor-stub', 'revision_id': '445676138', 'population': '5,476,031', 'article_quality': 'Stub'}
{'country': 'Portugal', 'article_name': 'Template:Portugal-mayor-stub', 'revision_id': '446008048', 'population': '10,349,000', 'article_quality': 'Stub'}
{'country': 'Switzerland', 'article_name': 'Template:Switzerland-mayor-stub', 'revision_id': '446011535', 'population': '8,292,851', 'article_quality': 'Stub'}
{'country': 'Sweden', 'article_name': 'Template:Sweden-mayor-stub', 'revision_id': '446019004', 'population': '9,804,792', 'article_quality': 'Stub'}
{'country': 'Belgium', 'article_name': 'Template:Belgium-mayor-stub', 'revision_id': '446192353', 'population': '11,211,064', 'article_quality': 'Stub'}
{'country': 'Spain', 'article_name': 'Template:Spain-mayor-stub', 'revision_id': '446194245', 'population': '46,368,000', 'article_quality': 'Stub'}
{'country': 'Italy', 'article_name': 'Template:Italy-mayor-stub', 'revision_id': '446196171', 'population': '62,466,780', 'article_quality': 'Stub'}
{'country': 'Mexico', 'article_name': 'Template:Mexico-mayor-stub', 'revision_id': '446200358', 'population': '127,017,000', 'article_quality': 'Stub'}
{'country': 'Chile', 'article_name': 'Template:Chile-mayor-stub', 'revision_id': '446202781', 'population': '18,025,000', 'article_quality': 'Stub'}
{'country': 'Peru', 'article_name': 'Template:Peru-mayor-stub', 'revision_id': '446205420', 'population': '31,151,643', 'article_quality': 'Stub'}
{'country': 'Bolivia', 'article_name': 'Template:Bolivia-mayor-stub', 'revision_id': '446207794', 'population': '10,475,500', 'article_quality': 'Stub'}
{'country': 'Colombia', 'article_name': 'Template:Colombia-mayor-stub', 'revision_id': '446214240', 'population': '48,218,000', 'article_quality': 'Stub'}
{'country': 'Bulgaria', 'article_name': 'Template:Bulgaria-mayor-stub', 'revision_id': '446219671', 'population': '7,181,000', 'article_quality': 'Stub'}
{'country': 'Serbia', 'article_name': 'Template:Serbia-mayor-stub', 'revision_id': '446221243', 'population': '7,097,190', 'article_quality': 'Stub'}

In [9]:
# The final data file should be named: en-wikipedia_traffic_200801-201709.csv
import csv
CSVOut = "bias_in_data.csv"
with open(CSVOut, "w",encoding="utf-8") as ofile:
    ofile.write("country" + "," + "article_name" + "," + "revision_id"+"," + "article_quality"+"," + "population"+ "\n")
    for item in combined_data:
        #print(item["article_quality"])
        #print(str(item["population"]).replace(',',''))
        ofile.write(str(item["country"]).replace(',','') + "," + str(item["article_name"]).replace(',','') + "," + str(item["revision_id"])+"," + str(item["article_quality"])+"," + str(item["population"]).replace(',','') + "\n")
print("Done")


Done

Analysis

FA - Featured article GA - Good article B - B-class article C - C-class article Start - Start-class article Stub - Stub-class article

if a country has a population of 10,000 people, and you found 10 articles about politicians from that country, then the percentage of articles-per-population would be .1%.


In [10]:
# Creating dataset Country and Article Count  
'''
{
    "country_name_1":"Count of Articles",
    "country_name_1":"Count of Articles"
}

'''

country_articleCount = {}

for item in combined_data:
    country_name = str(item["country"]).replace(',','')
    if (country_name in country_articleCount.keys()):
        if(item["article_name"]) is not None:
            country_articleCount[country_name]=country_articleCount[country_name]+1
    else:
        country_articleCount[country_name]=0
        if(item["article_name"]) is not None:
            country_articleCount[country_name]=1
        
print(country_articleCount)


{'Zambia': 26, 'Chad': 100, 'Zimbabwe': 167, 'Uganda': 188, 'Namibia': 165, 'Nigeria': 684, 'Colombia': 288, 'Chile': 352, 'Fiji': 199, 'Solomon Islands': 98, 'Palestinian Territory': 183, 'Somalia': 339, 'Cambodia': 217, 'Slovakia': 119, 'Slovenia': 59, 'Afghanistan': 327, 'Iraq': 302, 'Nepal': 363, 'Sri Lanka': 465, 'Laos': 109, 'Albania': 460, 'Costa Rica': 150, 'Hondura': 189, 'Czech Republic': 254, 'Canada': 852, 'Tunisia': 140, 'Guatemala': 84, 'Burkina Faso': 97, 'Angola': 110, 'Panama': 109, 'Japan': 441, 'Indonesia': 215, 'Madagascar': 240, 'Malaysia': 391, 'Gabon': 103, 'Germany': 703, 'Liberia': 158, 'Ghana': 395, 'Peru': 354, 'Argentina': 496, 'Spain': 881, 'South Africa': 382, 'Egypt': 239, 'Nicaragua': 116, 'Bangladesh': 324, 'India': 990, 'Iran': 832, 'Philippines': 515, 'Turkey': 353, 'Austria': 340, 'Azerbaijan': 182, 'Haiti': 166, 'Greece': 311, 'Hungary': 614, 'Iceland': 206, 'Moldova': 426, 'Romania': 348, 'Poland': 809, 'Luxembourg': 180, 'Denmark': 291, 'Kenya': 379, 'Ecuador': 187, 'Salvadoran': 119, 'Finland': 572, 'Portugal': 323, 'Switzerland': 407, 'Sweden': 380, 'Belgium': 523, 'Italy': 828, 'Mexico': 1081, 'Bolivia': 187, 'Bulgaria': 226, 'Serbia': 220, 'Russia': 882, 'Tanzania': 408, 'Sierra Leone': 166, 'Pakistan': 1045, 'Croatia': 168, 'Ukraine': 304, 'South Sudan': 133, 'United States': 1098, 'Yemen': 122, 'China': 1138, 'New Zealand': 791, 'Venezuela': 135, 'Australia': 1566, 'Estonia': 153, 'Lebanon': 188, 'Armenia': 199, 'Taiwan': 503, 'Cuba': 176, 'Lithuania': 248, 'Malawi': 122, 'Saint Kitts and Nevis': 32, 'Vietnam': 191, 'France': 1689, 'Norway': 658, 'Ireland': 381, 'Israel': 498, 'Palauan': 23, 'Jamaica': 85, 'Kyrgyzstan': 72, 'San Marino': 82, 'Bosnia-Herzegovina': 178, 'Turkmenistan': 33, 'Algeria': 119, 'French Guiana': 28, 'Djibouti': 39, 'Vanuatu': 62, 'Ivorian': 79, 'Netherlands': 702, 'Libya': 111, 'Malta': 103, 'Paraguay': 149, 'Saint Vincent and the Grenadines': 22, 'Rhodesian': 76, 'Omani': 25, 'Papua New Guinea': 163, 'Congo Dem. Rep. of': 142, 'Togo': 65, 'United Arab Emirates': 60, 'Ethiopia': 105, 'Tuvalu': 55, 'Niuean': 23, 'Antigua and Barbuda': 25, 'Uruguay': 290, 'Senegal': 43, 'Brazil': 556, 'Korea North': 39, 'United Kingdom': 867, 'Botswana': 68, 'Qatar': 51, 'Rwanda': 105, 'Nauru': 53, 'Sudan': 98, 'East Timorese': 36, 'Korea South': 282, 'Mozambique': 60, 'Saudi Arabia': 119, 'Faroese': 74, 'Cape Colony': 82, 'Benin': 94, 'Syria': 132, 'Tajikistan': 40, 'Mauritius': 68, 'Jordan': 48, 'South Korean': 99, 'Monaco': 40, 'Morocco': 208, 'Grenada': 36, 'Samoan': 77, 'Mali': 116, 'Tonga': 63, 'Myanmar': 237, 'Congo': 149, 'Montserratian': 27, 'Guyana': 20, 'Liechtenstein': 29, 'Sao Tome and Principe': 22, 'Guinea-Bissau': 21, 'Belarus': 72, 'Singapore': 69, 'Guinea': 89, 'Pitcairn Islands': 43, 'Marshall Islands': 37, 'Dominican Republic': 64, 'Maldives': 84, 'Kazakhstan': 79, 'Macedonia': 65, 'Kiribati': 32, 'Mongolia': 92, 'Abkhazia': 16, 'Montenegro': 74, 'Bhutan': 33, 'Thailand': 112, 'Latvia': 56, 'Suriname': 40, 'Niger': 80, 'Martinique': 34, 'Mauritania': 52, 'Carniolan': 22, 'Cameroon': 106, 'Lesotho': 30, 'Saint Lucian': 48, 'South African Republic': 15, 'Cyprus': 102, 'Gambia': 82, 'Incan': 7, 'Uzbekistan': 29, 'Bahrain': 42, 'Chechen': 38, 'Eritrea': 16, 'Jersey': 61, 'Kuwait': 37, 'Burundi': 76, 'Guernsey': 25, 'Central African Republic': 68, 'Equatorial Guinea': 32, 'Guadeloupe': 49, 'Kosovo': 48, 'Cape Verde': 37, 'Andorra': 34, 'Comoros': 51, 'South Ossetian': 18, 'Cook Island': 67, 'Trinidad and Tobago': 28, 'Federated States of Micronesia': 38, 'Dominica': 12, 'Tokelauan': 18, 'Bahamas': 20, 'Swaziland': 32, 'Dagestani': 7, 'Greenlandic': 13, 'Barbados': 14, 'Belize': 16, 'Ossetian': 9, 'Seychelles': 22, 'Somaliland': 5, 'Rojava': 3}

In [11]:
# Population Data 
# print(population_data)
# Creating dataste containining Country and Population 
'''
{
    "country_name_1":"Population",
    "country_name_1":"Population"
}

'''

country_population= {}

for key, value in population_data.items():
    countryName = str(key).replace(',','')
    country_population[countryName] = str(population_data[key]).replace(',','')
        
print(country_population)


{'Afghanistan': '32247000', 'Albania': '2892000', 'Algeria': '39948000', 'Andorra': '78000', 'Angola': '25000000', 'Antigua and Barbuda': '90000', 'Argentina': '42426000', 'Armenia': '3017106', 'Australia': '23888000', 'Austria': '8615955', 'Azerbaijan': '9651000', 'Bahamas': '377000', 'Bahrain': '1412299', 'Bangladesh': '160411000', 'Barbados': '278000', 'Belarus': '9524247', 'Belgium': '11211064', 'Belize': '368000', 'Benin': '10583034', 'Bhutan': '757000', 'Bolivia': '10475500', 'Bosnia-Herzegovina': '3650114', 'Botswana': '2139900', 'Brazil': '204519398', 'Brunei': '413000', 'Bulgaria': '7181000', 'Burkina Faso': '18450400', 'Burundi': '10742000', 'Cambodia': '15417100', 'Cameroon': '23739000', 'Canada': '35833000', 'Cape Verde': '514000', 'Central African Republic': '5551900', 'Chad': '13707000', 'Channel Islands': '164000', 'Chile': '18025000', 'China': '1371920000', 'Colombia': '48218000', 'Comoros': '764000', 'Congo': '4755000', 'Congo Dem. Rep. of': '73340200', 'Costa Rica': '4832000', "Cote d'Ivoire": '23281300', 'Croatia': '4215000', 'Cuba': '11139000', 'Curacao': '158000', 'Cyprus': '1153000', 'Czech Republic': '10551227', 'Denmark': '5676025', 'Djibouti': '900000', 'Dominica': '68000', 'Dominican Republic': '10508000', 'Ecuador': '16279000', 'Egypt': '89074000', 'El Salvador': '6366000', 'Equatorial Guinea': '805000', 'Eritrea': '5200000', 'Estonia': '1310504', 'Ethiopia': '98148000', 'Federated States of Micronesia': '103000', 'Fiji': '867000', 'Finland': '5476031', 'France': '64346720', 'French Guiana': '251000', 'French Polynesia': '263000', 'Gabon': '1751000', 'Gambia': '2021893', 'Georgia': '3804000', 'Germany': '81132000', 'Ghana': '27672800', 'Greece': '11520785', 'Grenada': '111000', 'Guadeloupe': '407000', 'Guam': '184200', 'Guatemala': '16183752', 'Guinea': '10985600', 'Guinea-Bissau': '1788000', 'Guyana': '743000', 'Haiti': '10924000', 'Honduras': '8340000', 'Hong Kong SAR': '7286402', 'Hungary': '9835030', 'Iceland': '330828', 'India': '1314097616', 'Indonesia': '255741973', 'Iran': '78483446', 'Iraq': '37056000', 'Ireland': '4630308', 'Israel': '8375384', 'Italy': '62466780', 'Jamaica': '2727000', 'Japan': '126866820', 'Jordan': '8118000', 'Kazakhstan': '17544274', 'Kenya': '44306000', 'Kiribati': '113400', 'Korea North': '24983000', 'Korea South': '50713867', 'Kosovo': '1802000', 'Kuwait': '3837700', 'Kyrgyzstan': '5951000', 'Laos': '6903049', 'Latvia': '1978454', 'Lebanon': '6185000', 'Lesotho': '1924381', 'Liberia': '4503000', 'Libya': '6317000', 'Liechtenstein': '37570', 'Lithuania': '2911203', 'Luxembourg': '569202', 'Macao SAR': '658611', 'Macedonia': '2070100', 'Madagascar': '23047400', 'Malawi': '17174000', 'Malaysia': '30788840', 'Maldives': '346946', 'Mali': '16749000', 'Malta': '431486', 'Marshall Islands': '55000', 'Martinique': '379000', 'Mauritania': '3641288', 'Mauritius': '1262660', 'Mayotte': '229890', 'Mexico': '127017000', 'Moldova': '4109000', 'Monaco': '38088', 'Mongolia': '3029335', 'Montenegro': '622421', 'Morocco': '34121000', 'Mozambique': '25736000', 'Myanmar': '52147000', 'Namibia': '2482100', 'Nauru': '10860', 'Nepal': '28039000', 'Netherlands': '16942373', 'New Caledonia': '271974', 'New Zealand': '4598066', 'Nicaragua': '6262000', 'Niger': '18884462', 'Nigeria': '181839400', 'Norway': '5194411', 'Oman': '4201000', 'Pakistan': '199047300', 'Palau': '18000', 'Palestinian Territory': '4481195', 'Panama': '3980000', 'Papua New Guinea': '7744600', 'Paraguay': '7020000', 'Peru': '31151643', 'Philippines': '102965300', 'Poland': '38478001', 'Portugal': '10349000', 'Puerto Rico': '3502000', 'Qatar': '2394524', 'Reunion': '851060', 'Romania': '19838662', 'Russia': '144302000', 'Rwanda': '11331300', 'Samoa': '194210', 'San Marino': '33000', 'Sao Tome and Principe': '195570', 'Saudi Arabia': '31565109', 'Senegal': '14690400', 'Serbia': '7097190', 'Seychelles': '92833', 'Sierra Leone': '6502960', 'Singapore': '5541121', 'Slovakia': '5424051', 'Slovenia': '2064000', 'Solomon Islands': '641900', 'Somalia': '11123000', 'South Africa': '55041000', 'South Sudan': '12152000', 'Spain': '46368000', 'Sri Lanka': '20868800', 'St. Kitts-Nevis': '46000', 'St. Lucia': '175000', 'St. Vincent & the Grenadines': '110000', 'Sudan': '40883900', 'Suriname': '576000', 'Swaziland': '1286000', 'Sweden': '9804792', 'Switzerland': '8292851', 'Syria': '17065000', 'Taiwan': '23468000', 'Tajikistan': '8452153', 'Tanzania': '52291000', 'Thailand': '65121250', 'Timor-Leste': '1244759', 'Togo': '7231000', 'Tonga': '103300', 'Trinidad and Tobago': '1351000', 'Tunisia': '11026000', 'Turkey': '78215000', 'Turkmenistan': '5373000', 'Tuvalu': '11800', 'Uganda': '40141000', 'Ukraine': '42828300', 'United Arab Emirates': '9577000', 'United Kingdom': '65092000', 'United States': '321234172', 'Uruguay': '3562000', 'Uzbekistan': '31290791', 'Vanuatu': '277500', 'Venezuela': '30620000', 'Vietnam': '91714080', 'Western Sahara': '604000', 'Yemen': '26737000', 'Zambia': '15473900', 'Zimbabwe': '17354000'}

In [12]:
# Creating Country : Proportion  (Number of Article for politicians from a country /Country's Population )
# country_articleCount 
# country_population

'''
{
    "country_name_1":"Proportion",
    "country_name_2":"Proportion"
}

'''

article_proportion  = {}

for key, value in country_articleCount.items():
    if (key in country_population.keys()):
        article_proportion[key] = (int(country_articleCount[key])/int(country_population[key])*100)
    
print (article_proportion)


{'Zambia': 0.00016802486768041668, 'Chad': 0.0007295542423579193, 'Zimbabwe': 0.0009623141638815259, 'Uganda': 0.0004683490695299071, 'Namibia': 0.006647596793038153, 'Nigeria': 0.00037615610258282857, 'Colombia': 0.000597287320087934, 'Chile': 0.0019528432732316228, 'Fiji': 0.02295271049596309, 'Solomon Islands': 0.015267175572519085, 'Palestinian Territory': 0.004083732129487782, 'Somalia': 0.003047738919356289, 'Cambodia': 0.0014075280046182486, 'Slovakia': 0.002193932173572852, 'Slovenia': 0.0028585271317829457, 'Afghanistan': 0.0010140478184017118, 'Iraq': 0.0008149827288428326, 'Nepal': 0.0012946253432718714, 'Sri Lanka': 0.0022282067009123667, 'Laos': 0.001579012404518641, 'Albania': 0.015905947441217153, 'Costa Rica': 0.003104304635761589, 'Czech Republic': 0.00240730296106794, 'Canada': 0.0023776965367119695, 'Tunisia': 0.001269726101940867, 'Guatemala': 0.0005190390955076425, 'Burkina Faso': 0.0005257338594285219, 'Angola': 0.00044, 'Panama': 0.0027386934673366836, 'Japan': 0.0003476086182344604, 'Indonesia': 8.406910976635032e-05, 'Madagascar': 0.0010413322110086171, 'Malaysia': 0.0012699406668130401, 'Gabon': 0.0058823529411764705, 'Germany': 0.000866489178129468, 'Liberia': 0.003508771929824561, 'Ghana': 0.001427394408950305, 'Peru': 0.0011363766591701119, 'Argentina': 0.0011690944232310375, 'Spain': 0.0019000172532781228, 'South Africa': 0.0006940280881524681, 'Egypt': 0.0002683162314480095, 'Nicaragua': 0.0018524433088470138, 'Bangladesh': 0.00020198116089295621, 'India': 7.533686903819783e-05, 'Iran': 0.0010600961634635666, 'Philippines': 0.0005001685033695819, 'Turkey': 0.0004513200792686825, 'Austria': 0.003946167314012202, 'Azerbaijan': 0.0018858149414568437, 'Haiti': 0.0015195898938117906, 'Greece': 0.002699468829597983, 'Hungary': 0.006242990616195375, 'Iceland': 0.06226800633561851, 'Moldova': 0.010367486006327574, 'Romania': 0.0017541505571293066, 'Poland': 0.0021025000753027686, 'Luxembourg': 0.0316232198762478, 'Denmark': 0.0051268273131284655, 'Kenya': 0.0008554146165304924, 'Ecuador': 0.0011487192087966092, 'Finland': 0.01044552158305897, 'Portugal': 0.0031210744999516865, 'Switzerland': 0.004907841706067069, 'Sweden': 0.00387565590376624, 'Belgium': 0.004665034469520467, 'Italy': 0.0013255045321689384, 'Mexico': 0.0008510671799837816, 'Bolivia': 0.0017851176554818384, 'Bulgaria': 0.003147193984124774, 'Serbia': 0.003099818378823168, 'Russia': 0.000611218139734723, 'Tanzania': 0.0007802489912221989, 'Sierra Leone': 0.002552683700960793, 'Pakistan': 0.0005250008415085258, 'Croatia': 0.00398576512455516, 'Ukraine': 0.0007098110361606695, 'South Sudan': 0.0010944700460829493, 'United States': 0.0003418067240990787, 'Yemen': 0.00045629651793394917, 'China': 8.294944311621669e-05, 'New Zealand': 0.017202884865071533, 'Venezuela': 0.00044088830829523186, 'Australia': 0.006555592766242465, 'Estonia': 0.011674897596649839, 'Lebanon': 0.003039611964430073, 'Armenia': 0.006595724512164968, 'Taiwan': 0.002143344128174536, 'Cuba': 0.001580034114372924, 'Lithuania': 0.008518815074043274, 'Malawi': 0.0007103761499941772, 'Vietnam': 0.00020825591882947525, 'France': 0.0026248424162101814, 'Norway': 0.01266746123862744, 'Ireland': 0.008228394309838568, 'Israel': 0.005945996028361207, 'Jamaica': 0.0031169783645031168, 'Kyrgyzstan': 0.0012098806923206186, 'San Marino': 0.24848484848484848, 'Bosnia-Herzegovina': 0.0048765600197692455, 'Turkmenistan': 0.0006141820212171971, 'Algeria': 0.0002978872534294583, 'French Guiana': 0.011155378486055776, 'Djibouti': 0.004333333333333333, 'Vanuatu': 0.022342342342342343, 'Netherlands': 0.004143457353937373, 'Libya': 0.0017571632103846762, 'Malta': 0.023870994655678282, 'Paraguay': 0.0021225071225071225, 'Papua New Guinea': 0.00210469230173282, 'Congo Dem. Rep. of': 0.00019361823392900483, 'Togo': 0.0008989074816761167, 'United Arab Emirates': 0.000626500991959904, 'Ethiopia': 0.00010698129355666953, 'Tuvalu': 0.46610169491525427, 'Antigua and Barbuda': 0.027777777777777776, 'Uruguay': 0.008141493542953397, 'Senegal': 0.0002927081631541687, 'Brazil': 0.00027185685340223815, 'Korea North': 0.00015610615218348477, 'United Kingdom': 0.001331960916856142, 'Botswana': 0.003177718584980607, 'Qatar': 0.0021298596297218155, 'Rwanda': 0.0009266368377856026, 'Nauru': 0.4880294659300184, 'Sudan': 0.0002397031594343984, 'Korea South': 0.0005560609290551636, 'Mozambique': 0.00023313646254274168, 'Saudi Arabia': 0.0003769985397484292, 'Benin': 0.0008882140981499257, 'Syria': 0.0007735130383826547, 'Tajikistan': 0.0004732521997649593, 'Mauritius': 0.005385456100612991, 'Jordan': 0.0005912786400591279, 'Monaco': 0.10501995379122034, 'Morocco': 0.0006095952639137189, 'Grenada': 0.032432432432432434, 'Mali': 0.0006925786614126216, 'Tonga': 0.06098741529525654, 'Myanmar': 0.00045448443822271653, 'Congo': 0.0031335436382754996, 'Guyana': 0.0026917900403768506, 'Liechtenstein': 0.07718924673941975, 'Sao Tome and Principe': 0.011249169095464539, 'Guinea-Bissau': 0.0011744966442953021, 'Belarus': 0.000755965274735105, 'Singapore': 0.00124523539550932, 'Guinea': 0.0008101514710166035, 'Marshall Islands': 0.06727272727272728, 'Dominican Republic': 0.0006090597639893414, 'Maldives': 0.02421126054198636, 'Kazakhstan': 0.0004502893650657759, 'Macedonia': 0.003139944930196609, 'Kiribati': 0.02821869488536155, 'Mongolia': 0.0030369701601176496, 'Montenegro': 0.011889059013111703, 'Bhutan': 0.004359313077939233, 'Thailand': 0.0001719868706451427, 'Latvia': 0.002830492900011827, 'Suriname': 0.006944444444444444, 'Niger': 0.0004236286953793018, 'Martinique': 0.008970976253298154, 'Mauritania': 0.0014280661128699514, 'Cameroon': 0.0004465225999410253, 'Lesotho': 0.0015589428496747785, 'Cyprus': 0.008846487424111016, 'Gambia': 0.004055605316403984, 'Uzbekistan': 9.267902495657588e-05, 'Bahrain': 0.002973874512408491, 'Eritrea': 0.0003076923076923077, 'Kuwait': 0.000964119133856216, 'Burundi': 0.0007075032582386893, 'Central African Republic': 0.0012248059222968713, 'Equatorial Guinea': 0.003975155279503106, 'Guadeloupe': 0.01203931203931204, 'Kosovo': 0.002663706992230855, 'Cape Verde': 0.0071984435797665365, 'Andorra': 0.04358974358974359, 'Comoros': 0.006675392670157068, 'Trinidad and Tobago': 0.002072538860103627, 'Federated States of Micronesia': 0.036893203883495145, 'Dominica': 0.01764705882352941, 'Bahamas': 0.005305039787798409, 'Swaziland': 0.002488335925349922, 'Barbados': 0.005035971223021583, 'Belize': 0.004347826086956522, 'Seychelles': 0.023698469294324214}

if a country has 10 articles about politicians, and 2 of them are FA or GA class articles, then the percentage of high-quality articles would be 20%.


In [13]:
# Creating Creating Country : Proportion    
# Combined Data 
# country_hqarticle
# Creating dataste containining Country and Count of High Quality  Articles  
'''
{
    "country_name_1":"Count High Quality Article",
    "country_name_1":"Count High Quality Article"
}

'''

country_hqarticle = {}

for item in combined_data:
    country_name = str(item["country"]).replace(',','')
    #print(country_name)
    if (country_name in country_hqarticle.keys()):
        if(str(item["article_quality"]) is 'FA' or str(item["article_quality"]) == 'GA'):
            country_hqarticle[country_name]=country_hqarticle[country_name]+1
    else:
        country_hqarticle[country_name]=0
        
print(country_hqarticle)


{'Zambia': 0, 'Chad': 1, 'Zimbabwe': 1, 'Uganda': 1, 'Namibia': 1, 'Nigeria': 4, 'Colombia': 3, 'Chile': 3, 'Fiji': 0, 'Solomon Islands': 0, 'Palestinian Territory': 10, 'Somalia': 8, 'Cambodia': 4, 'Slovakia': 2, 'Slovenia': 1, 'Afghanistan': 12, 'Iraq': 7, 'Nepal': 0, 'Sri Lanka': 8, 'Laos': 1, 'Albania': 5, 'Costa Rica': 0, 'Hondura': 0, 'Czech Republic': 1, 'Canada': 18, 'Tunisia': 1, 'Guatemala': 5, 'Burkina Faso': 2, 'Angola': 1, 'Panama': 4, 'Japan': 8, 'Indonesia': 8, 'Madagascar': 2, 'Malaysia': 6, 'Gabon': 2, 'Germany': 8, 'Liberia': 2, 'Ghana': 4, 'Peru': 1, 'Argentina': 11, 'Spain': 9, 'South Africa': 11, 'Egypt': 7, 'Nicaragua': 0, 'Bangladesh': 3, 'India': 12, 'Iran': 15, 'Philippines': 17, 'Turkey': 4, 'Austria': 3, 'Azerbaijan': 2, 'Haiti': 5, 'Greece': 2, 'Hungary': 3, 'Iceland': 2, 'Moldova': 0, 'Romania': 10, 'Poland': 8, 'Luxembourg': 1, 'Denmark': 3, 'Kenya': 5, 'Ecuador': 2, 'Salvadoran': 1, 'Finland': 0, 'Portugal': 3, 'Switzerland': 0, 'Sweden': 3, 'Belgium': 0, 'Italy': 4, 'Mexico': 5, 'Bolivia': 0, 'Bulgaria': 3, 'Serbia': 1, 'Russia': 25, 'Tanzania': 1, 'Sierra Leone': 0, 'Pakistan': 9, 'Croatia': 2, 'Ukraine': 11, 'South Sudan': 1, 'United States': 57, 'Yemen': 1, 'China': 26, 'New Zealand': 9, 'Venezuela': 3, 'Australia': 32, 'Estonia': 1, 'Lebanon': 6, 'Armenia': 4, 'Taiwan': 6, 'Cuba': 2, 'Lithuania': 1, 'Malawi': 3, 'Saint Kitts and Nevis': 0, 'Vietnam': 10, 'France': 17, 'Norway': 3, 'Ireland': 20, 'Israel': 13, 'Palauan': 1, 'Jamaica': 4, 'Kyrgyzstan': 1, 'San Marino': 0, 'Bosnia-Herzegovina': 6, 'Turkmenistan': 0, 'Algeria': 2, 'French Guiana': 0, 'Djibouti': 0, 'Vanuatu': 3, 'Ivorian': 1, 'Netherlands': 7, 'Libya': 2, 'Malta': 0, 'Paraguay': 1, 'Saint Vincent and the Grenadines': 0, 'Rhodesian': 3, 'Omani': 0, 'Papua New Guinea': 9, 'Congo Dem. Rep. of': 7, 'Togo': 1, 'United Arab Emirates': 2, 'Ethiopia': 3, 'Tuvalu': 3, 'Niuean': 0, 'Antigua and Barbuda': 0, 'Uruguay': 2, 'Senegal': 1, 'Brazil': 4, 'Korea North': 7, 'United Kingdom': 34, 'Botswana': 1, 'Qatar': 2, 'Rwanda': 0, 'Nauru': 0, 'Sudan': 2, 'East Timorese': 0, 'Korea South': 4, 'Mozambique': 0, 'Saudi Arabia': 12, 'Faroese': 0, 'Cape Colony': 2, 'Benin': 4, 'Syria': 4, 'Tajikistan': 0, 'Mauritius': 1, 'Jordan': 1, 'South Korean': 0, 'Monaco': 0, 'Morocco': 1, 'Grenada': 2, 'Samoan': 2, 'Mali': 2, 'Tonga': 0, 'Myanmar': 7, 'Congo': 1, 'Montserratian': 0, 'Guyana': 1, 'Liechtenstein': 0, 'Sao Tome and Principe': 0, 'Guinea-Bissau': 2, 'Belarus': 0, 'Singapore': 4, 'Guinea': 2, 'Pitcairn Islands': 0, 'Marshall Islands': 0, 'Dominican Republic': 0, 'Maldives': 1, 'Kazakhstan': 0, 'Macedonia': 0, 'Kiribati': 0, 'Mongolia': 3, 'Abkhazia': 1, 'Montenegro': 2, 'Bhutan': 3, 'Thailand': 3, 'Latvia': 1, 'Suriname': 0, 'Niger': 3, 'Martinique': 1, 'Mauritania': 3, 'Carniolan': 0, 'Cameroon': 1, 'Lesotho': 0, 'Saint Lucian': 1, 'South African Republic': 1, 'Cyprus': 1, 'Gambia': 6, 'Incan': 0, 'Uzbekistan': 3, 'Bahrain': 0, 'Chechen': 2, 'Eritrea': 0, 'Jersey': 1, 'Kuwait': 1, 'Burundi': 1, 'Guernsey': 1, 'Central African Republic': 5, 'Equatorial Guinea': 1, 'Guadeloupe': 0, 'Kosovo': 1, 'Cape Verde': 0, 'Andorra': 0, 'Comoros': 0, 'South Ossetian': 1, 'Cook Island': 0, 'Trinidad and Tobago': 1, 'Federated States of Micronesia': 0, 'Dominica': 1, 'Tokelauan': 0, 'Bahamas': 0, 'Swaziland': 0, 'Dagestani': 0, 'Greenlandic': 0, 'Barbados': 0, 'Belize': 0, 'Ossetian': 0, 'Seychelles': 0, 'Somaliland': 0, 'Rojava': 0}

In [14]:
# Creating Country and High Quality Article proportion (Number of High quality Article /Country's Population )
# country_articleCount 
# country_hqarticle

'''
{
    "country_name_1":"HqProportion",
    "country_name_2":"hqProportion"
}

'''

hqarticle_proportion  = {}

for key, value in country_articleCount.items():
    if (key in country_hqarticle.keys() and str(country_hqarticle[key]) != '0' ):
        #print(country_hqarticle[key])
        #print(country_articleCount[key])
        hqarticle_proportion[key] = (int(country_hqarticle[key])/int(country_articleCount[key])*100)
        #hqarticle_proportion[key]
        
print (hqarticle_proportion)


{'Chad': 1.0, 'Zimbabwe': 0.5988023952095809, 'Uganda': 0.5319148936170213, 'Namibia': 0.6060606060606061, 'Nigeria': 0.5847953216374269, 'Colombia': 1.0416666666666665, 'Chile': 0.8522727272727272, 'Palestinian Territory': 5.46448087431694, 'Somalia': 2.359882005899705, 'Cambodia': 1.8433179723502304, 'Slovakia': 1.680672268907563, 'Slovenia': 1.694915254237288, 'Afghanistan': 3.669724770642202, 'Iraq': 2.3178807947019866, 'Sri Lanka': 1.7204301075268817, 'Laos': 0.9174311926605505, 'Albania': 1.0869565217391304, 'Czech Republic': 0.39370078740157477, 'Canada': 2.112676056338028, 'Tunisia': 0.7142857142857143, 'Guatemala': 5.952380952380952, 'Burkina Faso': 2.0618556701030926, 'Angola': 0.9090909090909091, 'Panama': 3.669724770642202, 'Japan': 1.8140589569160999, 'Indonesia': 3.7209302325581395, 'Madagascar': 0.8333333333333334, 'Malaysia': 1.5345268542199488, 'Gabon': 1.9417475728155338, 'Germany': 1.1379800853485065, 'Liberia': 1.2658227848101267, 'Ghana': 1.0126582278481013, 'Peru': 0.2824858757062147, 'Argentina': 2.217741935483871, 'Spain': 1.0215664018161181, 'South Africa': 2.8795811518324608, 'Egypt': 2.928870292887029, 'Bangladesh': 0.9259259259259258, 'India': 1.2121212121212122, 'Iran': 1.8028846153846152, 'Philippines': 3.300970873786408, 'Turkey': 1.13314447592068, 'Austria': 0.8823529411764706, 'Azerbaijan': 1.098901098901099, 'Haiti': 3.0120481927710845, 'Greece': 0.6430868167202572, 'Hungary': 0.4885993485342019, 'Iceland': 0.9708737864077669, 'Romania': 2.8735632183908044, 'Poland': 0.9888751545117428, 'Luxembourg': 0.5555555555555556, 'Denmark': 1.0309278350515463, 'Kenya': 1.3192612137203166, 'Ecuador': 1.06951871657754, 'Salvadoran': 0.8403361344537815, 'Portugal': 0.9287925696594427, 'Sweden': 0.7894736842105263, 'Italy': 0.4830917874396135, 'Mexico': 0.46253469010175763, 'Bulgaria': 1.3274336283185841, 'Serbia': 0.45454545454545453, 'Russia': 2.8344671201814062, 'Tanzania': 0.24509803921568626, 'Pakistan': 0.8612440191387559, 'Croatia': 1.1904761904761905, 'Ukraine': 3.618421052631579, 'South Sudan': 0.7518796992481203, 'United States': 5.191256830601093, 'Yemen': 0.819672131147541, 'China': 2.2847100175746924, 'New Zealand': 1.1378002528445006, 'Venezuela': 2.2222222222222223, 'Australia': 2.0434227330779056, 'Estonia': 0.6535947712418301, 'Lebanon': 3.1914893617021276, 'Armenia': 2.0100502512562812, 'Taiwan': 1.1928429423459244, 'Cuba': 1.1363636363636365, 'Lithuania': 0.4032258064516129, 'Malawi': 2.459016393442623, 'Vietnam': 5.2356020942408374, 'France': 1.0065127294256957, 'Norway': 0.4559270516717325, 'Ireland': 5.2493438320209975, 'Israel': 2.610441767068273, 'Palauan': 4.3478260869565215, 'Jamaica': 4.705882352941177, 'Kyrgyzstan': 1.3888888888888888, 'Bosnia-Herzegovina': 3.3707865168539324, 'Algeria': 1.680672268907563, 'Vanuatu': 4.838709677419355, 'Ivorian': 1.2658227848101267, 'Netherlands': 0.9971509971509971, 'Libya': 1.8018018018018018, 'Paraguay': 0.6711409395973155, 'Rhodesian': 3.9473684210526314, 'Papua New Guinea': 5.521472392638037, 'Congo Dem. Rep. of': 4.929577464788732, 'Togo': 1.5384615384615385, 'United Arab Emirates': 3.3333333333333335, 'Ethiopia': 2.857142857142857, 'Tuvalu': 5.454545454545454, 'Uruguay': 0.6896551724137931, 'Senegal': 2.3255813953488373, 'Brazil': 0.7194244604316548, 'Korea North': 17.94871794871795, 'United Kingdom': 3.9215686274509802, 'Botswana': 1.4705882352941175, 'Qatar': 3.9215686274509802, 'Sudan': 2.0408163265306123, 'Korea South': 1.4184397163120568, 'Saudi Arabia': 10.084033613445378, 'Cape Colony': 2.4390243902439024, 'Benin': 4.25531914893617, 'Syria': 3.0303030303030303, 'Mauritius': 1.4705882352941175, 'Jordan': 2.083333333333333, 'Morocco': 0.4807692307692308, 'Grenada': 5.555555555555555, 'Samoan': 2.5974025974025974, 'Mali': 1.7241379310344827, 'Myanmar': 2.9535864978902953, 'Congo': 0.6711409395973155, 'Guyana': 5.0, 'Guinea-Bissau': 9.523809523809524, 'Singapore': 5.797101449275362, 'Guinea': 2.247191011235955, 'Maldives': 1.1904761904761905, 'Mongolia': 3.260869565217391, 'Abkhazia': 6.25, 'Montenegro': 2.7027027027027026, 'Bhutan': 9.090909090909092, 'Thailand': 2.6785714285714284, 'Latvia': 1.7857142857142856, 'Niger': 3.75, 'Martinique': 2.941176470588235, 'Mauritania': 5.769230769230769, 'Cameroon': 0.9433962264150944, 'Saint Lucian': 2.083333333333333, 'South African Republic': 6.666666666666667, 'Cyprus': 0.9803921568627451, 'Gambia': 7.317073170731707, 'Uzbekistan': 10.344827586206897, 'Chechen': 5.263157894736842, 'Jersey': 1.639344262295082, 'Kuwait': 2.7027027027027026, 'Burundi': 1.3157894736842104, 'Guernsey': 4.0, 'Central African Republic': 7.352941176470589, 'Equatorial Guinea': 3.125, 'Kosovo': 2.083333333333333, 'South Ossetian': 5.555555555555555, 'Trinidad and Tobago': 3.571428571428571, 'Dominica': 8.333333333333332}

Visualization

The visualization should be pretty straightforward. Produce four visualizations that show:

10 highest-ranked countries in terms of number of politician articles as a proportion of country population 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population 10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country 10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country


In [15]:
# 10 highest-ranked countries in terms of number of politician articles as a proportion of country population

import pandas as pd
from collections import OrderedDict
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline

In [16]:
# sorts article_proportion dictionary in descending order
sorted_descending_article_proportion = [(k, article_proportion[k]) for k in sorted(article_proportion, key=article_proportion.get, reverse=True)]
#for k, v in sorted_top_10_article_proportion:
   # print(k, v)

sorted_top_10_article_proportion={}
count =0
for k, v in sorted_descending_article_proportion:
    count =count+1
    sorted_top_10_article_proportion[k] =v
    if count==10:
        break
print(sorted_top_10_article_proportion)


{'Nauru': 0.4880294659300184, 'Tuvalu': 0.46610169491525427, 'San Marino': 0.24848484848484848, 'Monaco': 0.10501995379122034, 'Liechtenstein': 0.07718924673941975, 'Marshall Islands': 0.06727272727272728, 'Iceland': 0.06226800633561851, 'Tonga': 0.06098741529525654, 'Andorra': 0.04358974358974359, 'Federated States of Micronesia': 0.036893203883495145}

In [17]:
# https://stackoverflow.com/questions/18837262/convert-python-dict-into-a-dataframe 
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html 
ad= pd.Series(sorted_top_10_article_proportion) 
adata = {'proportion ':ad}
adf = pd.DataFrame(adata)
adf
adf.plot(kind='bar',figsize=(12,10) )
plt.xlabel('Country')
plt.ylabel('Proportion [Total wikipedia Article/ Population]')

plt.title('10 highest-ranked countries in terms of number of politician articles as a proportion of country population ')
plt.show()



In [18]:
print("10 highest-ranked countries in terms of number of politician articles as a proportion of country population")
print("")



print("COUNTRY                          PROPORTION")
print("-------------------------------------------")
print (ad.to_string(index=True))


10 highest-ranked countries in terms of number of politician articles as a proportion of country population

COUNTRY                          PROPORTION
-------------------------------------------
Andorra                           0.043590
Federated States of Micronesia    0.036893
Iceland                           0.062268
Liechtenstein                     0.077189
Marshall Islands                  0.067273
Monaco                            0.105020
Nauru                             0.488029
San Marino                        0.248485
Tonga                             0.060987
Tuvalu                            0.466102

In [19]:
# 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population
# sorts article_proportion dictionary in ascending order
sorted_ascending_article_proportion = [(k, article_proportion[k]) for k in sorted(article_proportion, key=article_proportion.get, reverse=False)]
#for k, v in sorted_bottom_10_article_proportion:
    #print(k, v)

sorted_bottom_10_article_proportion={}
count =0
for k, v in sorted_ascending_article_proportion:
    count =count+1
    sorted_bottom_10_article_proportion[k] =v
    if count==10:
        break
print(sorted_bottom_10_article_proportion)


{'India': 7.533686903819783e-05, 'China': 8.294944311621669e-05, 'Indonesia': 8.406910976635032e-05, 'Uzbekistan': 9.267902495657588e-05, 'Ethiopia': 0.00010698129355666953, 'Korea North': 0.00015610615218348477, 'Zambia': 0.00016802486768041668, 'Thailand': 0.0001719868706451427, 'Congo Dem. Rep. of': 0.00019361823392900483, 'Bangladesh': 0.00020198116089295621}

In [39]:
bd = pd.Series(sorted_bottom_10_article_proportion) 
bdata = {'proportion ':bd}
bdf = pd.DataFrame(bdata)
bdf
bdf.plot(kind='bar',figsize=(12,10), title='10 lowest-ranked countries in terms of number of politician articles as a proportion of country population' )
plt.xlabel('Country')
plt.ylabel('Proportion [Total wikipedia Article/ Population]')
plt.show()



In [21]:
print("10 lowest-ranked countries in terms of number of politician articles as a proportion of country population")
print("")


print("COUNTRY              PROPORTION")
print("--------------------------------")
print (bd.to_string(index=True))


10 lowest-ranked countries in terms of number of politician articles as a proportion of country population

COUNTRY              PROPORTION
--------------------------------
Bangladesh            0.000202
China                 0.000083
Congo Dem. Rep. of    0.000194
Ethiopia              0.000107
India                 0.000075
Indonesia             0.000084
Korea North           0.000156
Thailand              0.000172
Uzbekistan            0.000093
Zambia                0.000168

In [22]:
# 10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles
# about politicians from that country
sorted_descending_articleQ_proportion = [(k, hqarticle_proportion[k]) for k in sorted(hqarticle_proportion, key=hqarticle_proportion.get, reverse=True)]
#for k, v in sorted_top_10_articleQ_proportion:
#      print(k, v)
        

sorted_top_10_articleQ_proportion={}
count =0
for k, v in sorted_descending_articleQ_proportion:
    count =count+1
    sorted_top_10_articleQ_proportion[k] =v
    if count==10:
        break
print(sorted_top_10_articleQ_proportion)


{'Korea North': 17.94871794871795, 'Uzbekistan': 10.344827586206897, 'Saudi Arabia': 10.084033613445378, 'Guinea-Bissau': 9.523809523809524, 'Bhutan': 9.090909090909092, 'Dominica': 8.333333333333332, 'Central African Republic': 7.352941176470589, 'Gambia': 7.317073170731707, 'South African Republic': 6.666666666666667, 'Abkhazia': 6.25}

In [40]:
cd = pd.Series(sorted_top_10_articleQ_proportion) 
cdata = {'Percentage ':cd}
cdf = pd.DataFrame(cdata)
#cdf = cdf.sort('propotion')
cdf.plot(kind='bar',figsize=(12,10), title='10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country' )
plt.xlabel('Country')
plt.ylabel('Percentage [High Quality (FA or GA)wikipedia Article/ Total wikipedia Article *100]')
plt.show()



In [24]:
print("10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country")
print("")
print("COUNTRY                      Percentage")
print("----------------------------------------")
print (cd.to_string(index=True))


10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country

COUNTRY                      Percentage
----------------------------------------
Abkhazia                     6.250000
Bhutan                       9.090909
Central African Republic     7.352941
Dominica                     8.333333
Gambia                       7.317073
Guinea-Bissau                9.523810
Korea North                 17.948718
Saudi Arabia                10.084034
South African Republic       6.666667
Uzbekistan                  10.344828

In [25]:
# 10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles
# about politicians from that country
sorted_ascending_articleQ_proportion = [(k, hqarticle_proportion[k]) for k in sorted(hqarticle_proportion, key=hqarticle_proportion.get, reverse=False)]
#for k, v in sorted_bottom_10_articleQ_proportion:
      #print(k, v)
        
sorted_bottom_10_articleQ_proportion={}
count =0
for k, v in sorted_ascending_articleQ_proportion:
    count =count+1
    sorted_bottom_10_articleQ_proportion[k] =v
    if count==10:
        break
print(sorted_bottom_10_articleQ_proportion)


{'Tanzania': 0.24509803921568626, 'Peru': 0.2824858757062147, 'Czech Republic': 0.39370078740157477, 'Lithuania': 0.4032258064516129, 'Serbia': 0.45454545454545453, 'Norway': 0.4559270516717325, 'Mexico': 0.46253469010175763, 'Morocco': 0.4807692307692308, 'Italy': 0.4830917874396135, 'Hungary': 0.4885993485342019}

In [41]:
dd = pd.Series(sorted_bottom_10_articleQ_proportion) 
ddata = {'Percentage  ':dd}
ddf = pd.DataFrame(ddata)
ddf
ddf.plot(kind='bar' ,figsize=(12,10), title='10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country' )
plt.xlabel('Country')
plt.ylabel('Percentage [High Quality (FA or GA)wikipedia Article/ Total wikipedia Article *100]')
plt.show()



In [27]:
print("10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country")
print("")

print("COUNTRY         Percentage")
print("---------------------------")
print (dd.to_string(index=True))


10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country

COUNTRY         Percentage
---------------------------
Czech Republic    0.393701
Hungary           0.488599
Italy             0.483092
Lithuania         0.403226
Mexico            0.462535
Morocco           0.480769
Norway            0.455927
Peru              0.282486
Serbia            0.454545
Tanzania          0.245098

END

Below this block is Rough Work - Not for grading purpose .


In [28]:
Final_data = {'Top10':ad,
             'Bottom 10':bd,
              'Top10HQ':cd,
             'botton10HQ':dd}
Final_data_f = pd.DataFrame(Final_data)
Final_data_f


Out[28]:
Bottom 10 Top10 Top10HQ botton10HQ
Abkhazia NaN NaN 6.250000 NaN
Andorra NaN 0.043590 NaN NaN
Bangladesh 0.000202 NaN NaN NaN
Bhutan NaN NaN 9.090909 NaN
Central African Republic NaN NaN 7.352941 NaN
China 0.000083 NaN NaN NaN
Congo Dem. Rep. of 0.000194 NaN NaN NaN
Czech Republic NaN NaN NaN 0.393701
Dominica NaN NaN 8.333333 NaN
Ethiopia 0.000107 NaN NaN NaN
Federated States of Micronesia NaN 0.036893 NaN NaN
Gambia NaN NaN 7.317073 NaN
Guinea-Bissau NaN NaN 9.523810 NaN
Hungary NaN NaN NaN 0.488599
Iceland NaN 0.062268 NaN NaN
India 0.000075 NaN NaN NaN
Indonesia 0.000084 NaN NaN NaN
Italy NaN NaN NaN 0.483092
Korea North 0.000156 NaN 17.948718 NaN
Liechtenstein NaN 0.077189 NaN NaN
Lithuania NaN NaN NaN 0.403226
Marshall Islands NaN 0.067273 NaN NaN
Mexico NaN NaN NaN 0.462535
Monaco NaN 0.105020 NaN NaN
Morocco NaN NaN NaN 0.480769
Nauru NaN 0.488029 NaN NaN
Norway NaN NaN NaN 0.455927
Peru NaN NaN NaN 0.282486
San Marino NaN 0.248485 NaN NaN
Saudi Arabia NaN NaN 10.084034 NaN
Serbia NaN NaN NaN 0.454545
South African Republic NaN NaN 6.666667 NaN
Tanzania NaN NaN NaN 0.245098
Thailand 0.000172 NaN NaN NaN
Tonga NaN 0.060987 NaN NaN
Tuvalu NaN 0.466102 NaN NaN
Uzbekistan 0.000093 NaN 10.344828 NaN
Zambia 0.000168 NaN NaN NaN

In [29]:
Final_data_f.plot(figsize=(12, 10))


Out[29]:
<matplotlib.axes._subplots.AxesSubplot at 0x13fa9f1f748>

In [30]:
Final_data_f.plot(kind='barh', figsize=(12, 30))


Out[30]:
<matplotlib.axes._subplots.AxesSubplot at 0x13fa9f83b00>

In [31]:
fig, axes = plt.subplots(nrows=4, ncols=1)
for i, c in enumerate(Final_data_f.columns):
    Final_data_f[c].plot(kind='barh',ax=axes[i], figsize=(12,50), title=c)



In [33]:
#https://datasciencelab.wordpress.com/2013/12/21/beautiful-plots-with-pandas-and-matplotlib/
fig, axes = plt.subplots(nrows=4, ncols=1)
for i, c in enumerate(Final_data_f.columns):
    Final_data_f[c].plot(kind='bar', ax=axes[i], figsize=(12, 30), title=c)
#plt.savefig('All .png', bbox_inches='tight')