The goal of this project is to explore the concept of 'bias' through data on Wikipedia articles.
Sources:
* Wikipedia Data Set : Politicians by Country from the English-language Wikipedia [https://figshare.com/articles/Untitled_Item/5513449] : File Name - page_data.csv, with three columns:
1. "country", containing the sanitised country name, extracted from the category name;
2. "page", containing the unsanitised page title;
3. "last_edit", containing the edit ID of the last edit to the page.
* Population Data : http://www.prb.org/DataFinder/Topic/Rankings.aspx?ind=14 : File Name - Population Mid-2015.csv
* Article quality predictions : https://ores.wikimedia.org/v3/#!/scoring/get_v3_scores_context_revid_model (a minimal single-revision sketch follows below)
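The third source, ORES ("Objective Revision Evaluation Service"), is a machine learning API that predicts an article-quality class (FA, GA, B, C, Start, Stub) for a given revision ID. As a minimal sketch of the single-revision endpoint linked above (the User-Agent value here is a placeholder, not the header this notebook actually sends):

import requests

def get_single_score(revid, context='enwiki', model='wp10'):
    # GET /v3/scores/{context}/{revid}/{model}, per the ORES docs linked above
    url = 'https://ores.wikimedia.org/v3/scores/{}/{}/{}'.format(context, revid, model)
    return requests.get(url, headers={'User-Agent': 'placeholder'}).json()

# e.g. get_single_score(235107991) returns the wp10 prediction for that revision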
In [2]:
## getting the data from the CSV files
import csv
In [3]:
# Data Source 1 :
# reading data and saving in dictionary of dictionaries
'''
{
    "revision_id": {
        "country": "",
        "article_name": "",
        "revision_id": ""
    },
    "revision_id": {
    },
    ...
}
'''
page_data = dict()
with open('page_data.csv', encoding="utf8") as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # skip the header row
    for row in reader:
        # row looks like ['Template:ZambiaProvincialMinisters', 'Zambia', '235107991']
        revision_id = row[2]
        page_data[revision_id] = {
            "country": row[1],
            "article_name": row[0],
            "revision_id": row[2],
        }
# print(page_data)
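For reference, csv.DictReader reads the same file keyed by its header row; a sketch, assuming the columns are headed 'page', 'country', and 'last_edit' as described in the source list above:

page_data = dict()
with open('page_data.csv', encoding="utf8") as csvfile:
    for row in csv.DictReader(csvfile):  # the header row is consumed automatically
        page_data[row['last_edit']] = {
            "country": row['country'],
            "article_name": row['page'],
            "revision_id": row['last_edit'],
        }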
In [4]:
# Data Source 2 :
'''
{
    "country_name_1": "population",
    "country_name_2": "population"
}
'''
population_data = {}
# the first three lines of the file are header/metadata rows, so they are skipped;
# the last line in the raw data file is empty, so `row` also checks that
# the parsed list is non-empty
line_no = 0
with open('Population Mid-2015.csv', encoding="utf8") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        line_no = line_no + 1
        if line_no > 3 and row:
            # row looks like ['Afghanistan', 'Country', 'Mid-2015', 'Number', '32,247,000', '']
            population_data[row[0]] = row[4]
print(population_data)
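The population figures arrive as comma-grouped strings such as '32,247,000'; normalising them to integers at read time would avoid the repeated .replace(',', '') calls later in the notebook. A sketch:

def parse_population(raw):
    # '32,247,000' -> 32247000
    return int(raw.replace(',', ''))

# inside the reading loop above:
# population_data[row[0]] = parse_population(row[4])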
In [5]:
# Merging the Wikipedia data (page_data) and population data (population_data)
# by adding each country's population to its page_data records
'''
{
    "revision_id": {
        "country": "",
        "article_name": "",
        "revision_id": "",
        "population": ""
    },
    "revision_id": {
    },
    ...
}
'''
count = 0
for revision_id, value in page_data.items():
    country_page_data = value["country"]
    if population_data.get(str(country_page_data)) is not None:
        page_data[revision_id]["population"] = population_data[country_page_data]
    else:
        # for now, set population to 0 for countries that are not present in population_data
        page_data[revision_id]["population"] = 0
        count = count + 1
# print("number of entries with population 0")
# print(count)
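The same merge can be written more compactly with dict.get and a default value; a sketch that is equivalent apart from the missing-country counter:

for revision_id, record in page_data.items():
    # falls back to 0 when the country has no entry in population_data
    record["population"] = population_data.get(record["country"], 0)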
In [6]:
# Data Source 3 :
import requests
import json
headers = {'User-Agent' : 'https://github.com/abhishekanand', 'From' : 'anand1@uw.edu'}
def get_ores_data(revision_ids, headers):
    # Define the endpoint
    endpoint = 'https://ores.wikimedia.org/v3/scores/{project}/?models={model}&revids={revids}'
    # Specify the parameters, joining all the revision IDs together separated by | marks
    params = {'project': 'enwiki',
              'model': 'wp10',
              'revids': '|'.join(str(x) for x in revision_ids)}
    # pass the identifying headers along with the request
    api_call = requests.get(endpoint.format(**params), headers=headers)
    response = api_call.json()
    return response
# So if we grab some example revision IDs and turn them into a list and then call get_ores_data...
# example_ids = [783381498, 807355596, 757539710]
# get_ores_data(example_ids, headers)
no_revision_ids = len(page_data)
# print(no_revision_ids)
# getting all the revision ids from page_data as a list
revision_ids = list(page_data.keys())
counter = 0
call_count = 0
# prediction is being added in page_data
'''
{
    "revision_id": {
        "country": "",
        "article_name": "",
        "revision_id": "",
        "population": "",
        "article_quality": ""
    },
    "revision_id": {
    },
    ...
}
'''
while counter < no_revision_ids:
    # Python slicing handles the final partial batch automatically:
    # revision_ids[counter:counter+100] just returns however many IDs remain
    temp = get_ores_data(revision_ids[counter:counter+100], headers)
    batch_scores = temp["enwiki"]["scores"]
    for revision, value in batch_scores.items():
        if value["wp10"].get("score") is not None:
            prediction = value["wp10"]["score"]["prediction"]
            page_data[revision]["article_quality"] = prediction
        else:
            # some revisions cannot be scored; mark them 'NA'
            page_data[revision]["article_quality"] = 'NA'
    counter = counter + 100
    call_count = call_count + 1
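A long sequence of API calls can hit transient network failures; a hedged sketch of a retry wrapper around get_ores_data (the retry count and backoff are arbitrary choices, not part of the assignment):

import time

def get_ores_data_with_retry(revision_ids, headers, retries=3):
    for attempt in range(retries):
        try:
            return get_ores_data(revision_ids, headers)
        except (requests.RequestException, ValueError):
            # ValueError covers a non-JSON response body from .json()
            time.sleep(2 ** attempt)  # simple exponential backoff
    raise RuntimeError("ORES request failed after {} attempts".format(retries))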
In [7]:
# print(call_count)  # number of batched API calls made
In [8]:
# Flattening the data set into a list of records, each containing:
#   country
#   article_name
#   revision_id
#   article_quality
#   population
combined_data = []  # empty list
for key, value in page_data.items():
    combined_data.append(value)
'''
Printing only the first 100 entries, to stay under Jupyter's
"IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`."
'''
for i in range(100):
    print(combined_data[i])
In [9]:
# The final data file is named: bias_in_data.csv
import csv
CSVOut = "bias_in_data.csv"
with open(CSVOut, "w", encoding="utf-8") as ofile:
    ofile.write("country,article_name,revision_id,article_quality,population\n")
    for item in combined_data:
        # commas are stripped from values so they don't break the unquoted CSV format
        ofile.write(str(item["country"]).replace(',', '') + "," +
                    str(item["article_name"]).replace(',', '') + "," +
                    str(item["revision_id"]) + "," +
                    str(item["article_quality"]) + "," +
                    str(item["population"]).replace(',', '') + "\n")
print("Done")
In [10]:
# Creating dataset of Country and Article Count
'''
{
    "country_name_1": "Count of Articles",
    "country_name_2": "Count of Articles"
}
'''
country_articleCount = {}
for item in combined_data:
    country_name = str(item["country"]).replace(',', '')
    if country_name in country_articleCount:
        country_articleCount[country_name] = country_articleCount[country_name] + 1
    else:
        country_articleCount[country_name] = 1
print(country_articleCount)
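collections.Counter expresses the same per-country tally in a couple of lines; a sketch (Counter is a dict subclass, so the rest of the notebook would work unchanged):

from collections import Counter

country_articleCount = Counter(
    str(item["country"]).replace(',', '') for item in combined_data
)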
In [11]:
# Population Data
# print(population_data)
# Creating dataset containing Country and Population
'''
{
    "country_name_1": "Population",
    "country_name_2": "Population"
}
'''
country_population = {}
for key, value in population_data.items():
    countryName = str(key).replace(',', '')
    country_population[countryName] = str(value).replace(',', '')
print(country_population)
In [12]:
# Creating Country : Proportion (number of politician articles from a country / country's population)
# inputs: country_articleCount, country_population
'''
{
    "country_name_1": "Proportion",
    "country_name_2": "Proportion"
}
'''
article_proportion = {}
for key, value in country_articleCount.items():
    if key in country_population:
        article_proportion[key] = int(country_articleCount[key]) / int(country_population[key]) * 100
print(article_proportion)
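For a concrete (made-up) example: a country with 500 politician articles and a population of 5,000,000 gets a proportion of 500 / 5000000 * 100 = 0.01, i.e. one article per 10,000 people.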
In [13]:
# Creating dataset containing Country and Count of High-Quality Articles
'''
{
    "country_name_1": "Count of High Quality Articles",
    "country_name_2": "Count of High Quality Articles"
}
'''
country_hqarticle = {}
for item in combined_data:
    country_name = str(item["country"]).replace(',', '')
    if country_name not in country_hqarticle:
        country_hqarticle[country_name] = 0
    # 'FA' and 'GA' are the two high-quality classes; compare with `==`,
    # since `is` tests identity, not string equality
    if item["article_quality"] == 'FA' or item["article_quality"] == 'GA':
        country_hqarticle[country_name] = country_hqarticle[country_name] + 1
print(country_hqarticle)
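The same tally, as a sketch with collections.defaultdict; adding int(...) for every row keeps countries with zero high-quality articles in the dictionary:

from collections import defaultdict

country_hqarticle = defaultdict(int)
for item in combined_data:
    country_name = str(item["country"]).replace(',', '')
    # True/False coerces to 1/0, so every country gets an entry
    country_hqarticle[country_name] += int(item["article_quality"] in ('FA', 'GA'))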
In [14]:
# Creating Country and High-Quality Article proportion
# (number of high-quality articles / country's total article count)
# inputs: country_articleCount, country_hqarticle
'''
{
    "country_name_1": "HqProportion",
    "country_name_2": "HqProportion"
}
'''
hqarticle_proportion = {}
for key, value in country_articleCount.items():
    # countries with zero high-quality articles are left out of this ranking
    if key in country_hqarticle and country_hqarticle[key] != 0:
        hqarticle_proportion[key] = int(country_hqarticle[key]) / int(country_articleCount[key]) * 100
print(hqarticle_proportion)
The visualization should be pretty straightforward. Produce four visualizations that show:
* 10 highest-ranked countries in terms of number of politician articles as a proportion of country population
* 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population
* 10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country
* 10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country
All four charts share the same rank-and-plot pattern; a reusable helper is sketched after the imports below.
In [15]:
# 10 highest-ranked countries in terms of number of politician articles as a proportion of country population
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
%matplotlib inline
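As promised above, a sketch of a shared helper for the four charts, using pd and plt imported in the cell above and the proportion dictionaries built earlier (the cells below spell each chart out step by step instead):

def plot_ranking(proportions, title, ylabel, top=True, n=10):
    # sort descending for a top-n chart, ascending for a bottom-n chart
    ordered = sorted(proportions.items(), key=lambda kv: kv[1], reverse=top)[:n]
    countries = [k for k, v in ordered]
    values = [v for k, v in ordered]
    pd.Series(values, index=countries).plot(kind='bar', figsize=(12, 10), title=title)
    plt.xlabel('Country')
    plt.ylabel(ylabel)
    plt.show()

# e.g. plot_ranking(article_proportion,
#                   'Top 10 countries by politician articles per capita',
#                   'Proportion [politician articles / population * 100]')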
In [16]:
# sorts article_proportion dictionary in descending order
sorted_descending_article_proportion = [(k, article_proportion[k]) for k in sorted(article_proportion, key=article_proportion.get, reverse=True)]
# keep the first ten entries
sorted_top_10_article_proportion = {}
count = 0
for k, v in sorted_descending_article_proportion:
    count = count + 1
    sorted_top_10_article_proportion[k] = v
    if count == 10:
        break
print(sorted_top_10_article_proportion)
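heapq can pick the ten largest (or smallest) entries without building the fully sorted list first; a sketch:

import heapq

top_10 = dict(heapq.nlargest(10, article_proportion.items(), key=lambda kv: kv[1]))
# heapq.nsmallest(...) gives the bottom-10 counterpart used in the cells below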
In [17]:
# https://stackoverflow.com/questions/18837262/convert-python-dict-into-a-dataframe
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html
ad = pd.Series(sorted_top_10_article_proportion)
adf = pd.DataFrame({'proportion': ad})
adf.plot(kind='bar', figsize=(12, 10))
plt.xlabel('Country')
plt.ylabel('Proportion [politician articles / population * 100]')
plt.title('10 highest-ranked countries in terms of number of politician articles as a proportion of country population')
plt.show()
In [18]:
print("10 highest-ranked countries in terms of number of politician articles as a proportion of country population")
print("")
print("COUNTRY PROPORTION")
print("-------------------------------------------")
print(ad.to_string(index=True))
In [19]:
# 10 lowest-ranked countries in terms of number of politician articles as a proportion of country population
# sorts article_proportion dictionary in ascending order
sorted_ascending_article_proportion = [(k, article_proportion[k]) for k in sorted(article_proportion, key=article_proportion.get, reverse=False)]
# keep the first ten entries
sorted_bottom_10_article_proportion = {}
count = 0
for k, v in sorted_ascending_article_proportion:
    count = count + 1
    sorted_bottom_10_article_proportion[k] = v
    if count == 10:
        break
print(sorted_bottom_10_article_proportion)
In [39]:
bd = pd.Series(sorted_bottom_10_article_proportion)
bdf = pd.DataFrame({'proportion': bd})
bdf.plot(kind='bar', figsize=(12, 10), title='10 lowest-ranked countries in terms of number of politician articles as a proportion of country population')
plt.xlabel('Country')
plt.ylabel('Proportion [politician articles / population * 100]')
plt.show()
In [21]:
print("10 lowest-ranked countries in terms of number of politician articles as a proportion of country population")
print("")
print("COUNTRY PROPORTION")
print("--------------------------------")
print(bd.to_string(index=True))
In [22]:
# 10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles
# about politicians from that country
sorted_descending_articleQ_proportion = [(k, hqarticle_proportion[k]) for k in sorted(hqarticle_proportion, key=hqarticle_proportion.get, reverse=True)]
sorted_top_10_articleQ_proportion = {}
count = 0
for k, v in sorted_descending_articleQ_proportion:
    count = count + 1
    sorted_top_10_articleQ_proportion[k] = v
    if count == 10:
        break
print(sorted_top_10_articleQ_proportion)
In [40]:
cd = pd.Series(sorted_top_10_articleQ_proportion)
cdf = pd.DataFrame({'Percentage': cd})
cdf.plot(kind='bar', figsize=(12, 10), title='10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country')
plt.xlabel('Country')
plt.ylabel('Percentage [high-quality (FA or GA) articles / total articles * 100]')
plt.show()
In [24]:
print("10 highest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country")
print("")
print("COUNTRY Percentage")
print("----------------------------------------")
print(cd.to_string(index=True))
In [25]:
# 10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles
# about politicians from that country
sorted_ascending_articleQ_proportion = [(k, hqarticle_proportion[k]) for k in sorted(hqarticle_proportion, key=hqarticle_proportion.get, reverse=False)]
sorted_bottom_10_articleQ_proportion = {}
count = 0
for k, v in sorted_ascending_articleQ_proportion:
    count = count + 1
    sorted_bottom_10_articleQ_proportion[k] = v
    if count == 10:
        break
print(sorted_bottom_10_articleQ_proportion)
In [41]:
dd = pd.Series(sorted_bottom_10_articleQ_proportion)
ddf = pd.DataFrame({'Percentage': dd})
ddf.plot(kind='bar', figsize=(12, 10), title='10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country')
plt.xlabel('Country')
plt.ylabel('Percentage [high-quality (FA or GA) articles / total articles * 100]')
plt.show()
In [27]:
print("10 lowest-ranked countries in terms of number of GA and FA-quality articles as a proportion of all articles about politicians from that country")
print("")
print("COUNTRY Percentage")
print("---------------------------")
print(dd.to_string(index=True))
In [28]:
Final_data = {'Top10': ad,
              'Bottom10': bd,
              'Top10HQ': cd,
              'Bottom10HQ': dd}
Final_data_f = pd.DataFrame(Final_data)
Final_data_f
Out[28]:
In [29]:
Final_data_f.plot(figsize=(12, 10))
Out[29]:
In [30]:
Final_data_f.plot(kind='barh', figsize=(12, 30))
Out[30]:
In [31]:
fig, axes = plt.subplots(nrows=4, ncols=1)
for i, c in enumerate(Final_data_f.columns):
    Final_data_f[c].plot(kind='barh', ax=axes[i], figsize=(12, 50), title=c)
In [33]:
#https://datasciencelab.wordpress.com/2013/12/21/beautiful-plots-with-pandas-and-matplotlib/
fig, axes = plt.subplots(nrows=4, ncols=1)
for i, c in enumerate(Final_data_f.columns):
    Final_data_f[c].plot(kind='bar', ax=axes[i], figsize=(12, 30), title=c)
# plt.savefig('All.png', bbox_inches='tight')