In [47]:
#TODO: words written by men and women
import codecs
import csv
import fnmatch
import os
import sys

import cjson
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as P
# Load every month's article CSV and attach the classifier's label to each
# row as row['topic']. The merged rows accumulate in `lines`, which the cells
# further down consume.

# Raise the csv module's per-field size cap as far as it will go
# (presumably because rows carry full article text -- TODO confirm).
csv.field_size_limit(sys.maxsize)
lines = []
#CHOOSE A FOLDER FROM ABOVE
#folder = "./test_results/30bf9fd5bb824eb49e89c8a828276348c0b1570c/"
articles_folder = "./annual_data/"
classification_folder = "./classification_results"

# One article CSV per month ...
files = [
    "[2013-08-01T00:00:00Z TO 2013-09-01T00:00:00Z].csv",
    "[2013-09-01T00:00:00Z TO 2013-10-01T00:00:00Z].csv",
    "[2013-10-01T00:00:00Z TO 2013-11-01T00:00:00Z].csv",
    "[2013-11-01T00:00:00Z TO 2013-12-01T00:00:00Z].csv",
    "[2013-12-01T00:00:00Z TO 2014-01-01T00:00:00Z].csv",
    "[2014-01-01T00:00:00Z TO 2014-02-01T00:00:00Z].csv",
    "[2014-02-01T00:00:00Z TO 2014-03-01T00:00:00Z].csv",
    "[2014-03-01T00:00:00Z TO 2014-04-01T00:00:00Z].csv",
    "[2014-04-01T00:00:00Z TO 2014-05-01T00:00:00Z].csv",
    "[2014-05-01T00:00:00Z TO 2014-06-01T00:00:00Z].csv",
    "[2014-06-01T00:00:00Z TO 2014-07-01T00:00:00Z].csv",
    "[2014-07-01T00:00:00Z TO 2014-08-01T00:00:00Z].csv",
]
# ... and, at the same index, the folder holding that month's classifier output.
# NOTE(review): some names are URL-encoded (%3A) and the last one has no
# surrounding [ ]; presumably they mirror the folder names on disk -- confirm.
topic_folders = [
    "[2013-08-01T00:00:00Z TO 2013-09-01T00:00:00Z]",
    "[2013-09-01T00:00:00Z TO 2013-10-01T00:00:00Z]",
    "[2013-10-01T00:00:00Z TO 2013-11-01T00:00:00Z]",
    "[2013-11-01T00:00:00Z TO 2013-12-01T00:00:00Z]",
    "[2013-12-01T00%3A00%3A00Z TO 2014-01-01T00%3A00%3A00Z]",
    "[2014-01-01T00:00:00Z TO 2014-02-01T00:00:00Z]",
    "[2014-02-01T00%3A00%3A00Z TO 2014-03-01T00%3A00%3A00Z]",
    "[2014-03-01T00:00:00Z TO 2014-04-01T00:00:00Z]",
    "[2014-04-01T00:00:00Z TO 2014-05-01T00:00:00Z]",
    "[2014-05-01T00:00:00Z TO 2014-06-01T00:00:00Z]",
    "[2014-06-01T00:00:00Z TO 2014-07-01T00:00:00Z]",
    "2014-07-01T00%3A00%3A00Z TO 2014-08-01T00%3A00%3A00Z",
]

for index, articles_file in enumerate(files):
    topic_dir = os.path.join(classification_folder, topic_folders[index])

    # Find the corresponding month's opinion topic data. Reset per month so a
    # missing file can never silently reuse the previous month's labels.
    # Candidates are sorted so that "last match wins" is deterministic
    # (os.listdir returns entries in arbitrary order).
    topic_data = None
    for candidate in sorted(os.listdir(topic_dir)):
        if fnmatch.fnmatch(candidate, 'opeds*JSON'):
            with codecs.open(os.path.join(topic_dir, candidate), 'r', encoding='utf8') as topic_file:
                topic_data = cjson.decode(topic_file.read())
    if topic_data is None:
        raise IOError("no 'opeds*JSON' topic file found in {0}".format(topic_dir))

    # Open up the articles CSV file for the month and label each row.
    # DictReader consumes the header line itself, so row i is the i-th data row.
    # NOTE(review): assumes topic_data['topics'][i] describes that same article
    # (the original carried a "TODO: improve later" on this alignment).
    rows = 0
    with open(os.path.join(articles_folder, articles_file)) as f:
        for i, row in enumerate(csv.DictReader(f)):
            row['topic'] = topic_data['topics'][i]['classified_as']
            lines.append(row)
            rows += 1

    # Check topic_data and the article rows to see if the merge is feasible:
    # the two counts should agree for every month.
    print("articles: {0}, topic_data: {1}".format(rows, len(topic_data['topics'])))

len(lines)
Out[47]:
In [48]:
# Previous code
#data = codecs.open('./Data/topic_oped_classification_results-sands_copy/2014-01-01_TO_2014-02-01/topics_file_2014-08-18_11_02_38.JSON', 'r', encoding='utf8').read()
#data = cjson.decode(data)
#topics = data['topics']
#print('Found %d topics in file.' % (len(topics)))
In [49]:
#import fnmatch
#for index in range(0,len(files)):
# for file in os.listdir(os.path.join("classification_results",topic_folders[index])):
# if fnmatch.fnmatch(file, 'opeds*JSON'):
# f = codecs.open(os.path.join("classification_results", topic_folders[index], file), 'r', encoding='utf8').read()
# data = cjson.decode(f)
# topics = data['topics']
# print('Found %d topics in file.' % (len(topics)))
In [49]:
In [50]:
#SUMMARIZE SECTION IDENTIFICATION
# Bare expression: the result is discarded (it is not the cell's last line);
# its only effect is to raise if `lines` is empty.
lines[0].keys()

# media_id (as it appears in the article rows) -> outlet name.
MEDIA = {
    '1': "new york times",
    '2': "washington post",
    '6': "la times",
    '7': "new york post",
    '1150': "wall street journal",
    '1757': "salon",
    '1707': "daily beast",
    '1750': "telegraph",
    '314': "huffington post",
    "27502": "huffington post",  #assuming these are the same for now
}

# media[outlet][section] -> number of article rows carrying that section value.
media = {}
for line in lines:
    mediakey = MEDIA[line['media_id']]
    section = line['section']
    per_section = media.setdefault(mediakey, {})
    per_section[section] = per_section.get(section, 0) + 1

# One summary line per outlet: distinct sections and total article rows.
for key in media:
    articles = sum(media[key].values())
    print("{0}: {1} sections, {2} articles".format(key, len(media[key]), articles))
    #for section in media[key].keys():
    #    if(not section is None):
    #        if(section.lower().find("opinion")>=0):
    #            print "  {0}: {1}".format(section,media[key][section])
In [51]:
# GROUP BYLINES BY MEDIA ORGANISATION
# AND SUMMARIZE
from byline_gender import BylineGender

b = BylineGender()

# media_people[outlet][byline] -> number of "oped"-classified rows with that byline.
media_people = {}
# Distinct section values seen on "oped"-classified rows, in first-seen order.
sections = []

for line in lines:
    mediakey = MEDIA[line['media_id']]
    byline_text = line['byline']

    # Only rows the classifier labelled "oped" enter the dataset
    # (a missing/None topic is simply not equal to "oped").
    if line['topic'] != "oped":
        continue

    section = line['section']
    if section not in sections:
        sections.append(section)

    # A byline string may yield several names; each one is credited.
    for byline in b.get_full_names(byline_text):
        outlet_bylines = media_people.setdefault(mediakey, {})
        outlet_bylines[byline] = outlet_bylines.get(byline, 0) + 1
In [52]:
# For each outlet (alphabetical), print its byline count and plot how many
# authors published 1, 2, 3, ... op-ed articles.
# Uses the builtin sorted() rather than a bare `sort`, which only exists when a
# pylab star-import has leaked into the notebook namespace.
for key in sorted(media_people.keys()):
    print("{0}: {1} bylines".format(key, len(media_people[key])))
    values = list(media_people[key].values())
    # Bin count = the largest per-author article count for this outlet.
    plt.hist(values, max(values))
    plt.xlabel("Articles Published in {0}".format(key))
    plt.ylabel('Number of Authors', fontsize= 20)
    # NOTE(review): indentation was lost in the export; show() is assumed to
    # sit inside the loop (one figure per outlet).
    plt.show()
In [53]:
# Per-outlet histogram of articles-per-author, split by the gender label that
# BylineGender assigns to each byline. Also tallies how many bylines ended up
# "unknown" versus labelled, across all outlets.
from byline_gender import BylineGender

b = BylineGender()
b.load_name_org_online()
print(b.org_name_gender("washington post","editorial board"))

unknown = []  # bylines labelled "unknown"
known = []    # bylines given any other (non-"ignore") label

for org in sorted(media_people.keys()):
    print("{0}: {1} bylines".format(org,len(media_people[org])))
    # label -> {byline: article count}
    vals = {"female":{},"male":{},"unknown":{}}
    for name in media_people[org].keys():
        #gender = b.single_name_gender(name)
        gender = b.org_name_gender(org,name)
        # Bylines labelled "ignore" are left out of both the plot and the tallies.
        # NOTE(review): indentation was lost in the export; the unknown/known
        # split is assumed to be nested under this check.
        if gender != "ignore":
            vals[gender][name] = media_people[org][name]
            if gender == "unknown":
                unknown.append(name)
            else:
                known.append(name)

    # Largest per-author article count in this outlet. The original took
    # max() over the dict itself, i.e. over author *names*, not counts.
    # The plot below does not consume `m`; it is kept for inspection.
    m = 0
    for v in vals.values():
        if len(v) > 0 and max(v.values()) > m:
            m = max(v.values())

    # One dataset per label, in alphabetical label order so the legend matches.
    h = []
    labels = []
    for v in sorted(vals.keys()):
        labels.append(v)
        h.append(list(vals[v].values()))
        # An empty dataset is replaced by a single 0 so every label still
        # contributes a series to the histogram.
        if len(h[-1]) == 0:
            h[-1] = [0]

    plt.figure()
    n, bins, patches = plt.hist(h)
    plt.xlabel("Articles Published in {0}".format(org))
    plt.ylabel('Number of Authors', fontsize= 20)
    # plt.legend instead of a bare `legend`, which relied on a leaked pylab namespace.
    plt.legend(patches, labels)
    plt.show()

print("UNKNOWN BYLINES: {0}".format(len(unknown)))
print("GUESSED BYLINES: {0}".format(len(known)))
In [54]:
#OUTPUT TO BYLINE FILE
#f = open('org_people_upload.csv', 'w')
#b.export_org_names(media_people,f)
#f.close()
In [56]:
def pct(a,b):
    """Return `a` as a percentage of `b`.

    Both arguments are passed through float(), so ints and numeric strings
    are accepted. A zero denominator yields 0.0 instead of raising
    ZeroDivisionError, so an empty group simply reports 0%.
    """
    denominator = float(b)
    if denominator == 0:
        return 0.0
    return 100*(float(a)/denominator)
def _plot_gender_pie(counts, denominator, title):
    """Draw one female/male/unknown pie chart from raw counts over `denominator`."""
    colors = '#78E678', '#E8CA53', "#CCCCCC"
    labels = 'female', 'male', 'unknown'
    fracs = [pct(counts[label], denominator) for label in labels]
    explode = (0.06, 0, 0)  # pull the "female" wedge out slightly
    P.figure(1, figsize=(6,6))
    P.pie(fracs, explode=explode, colors=colors, labels=labels,
          autopct='%1.1f%%')
    P.title(title, bbox={'facecolor':'0.8', 'pad':5})
    P.show()


# Per-outlet gender breakdown, counted two ways: per article and per unique
# author. Raw counts go into pct_table; each breakdown is also drawn as a pie.
pct_table = []
# Header order matches the rows appended below (type first, then org);
# the original header had the two columns swapped.
pct_table.append(["type","org","female","male","unknown"])

for org in sorted(media_people.keys()):
    print("{0}: {1} bylines".format(org,len(media_people[org])))
    article_count = {"female":0,"male":0,"unknown":0}
    people_count = {"female":0,"male":0,"unknown":0}
    total = 0         # articles by non-ignored bylines
    nontotal = 0      # articles by bylines labelled "ignore"
    people_total = 0  # non-ignored bylines
    for name in media_people[org].keys():
        #gender = b.single_name_gender(name)
        gender = b.org_name_gender(org,name)
        if gender != "ignore":
            article_count[gender] += media_people[org][name]
            people_count[gender] += 1
            people_total += 1
            total += media_people[org][name]
        else:
            nontotal += media_people[org][name]

    pct_table.append(["people",org,people_count['female'],people_count['male'],people_count['unknown']])
    pct_table.append(["article",org,article_count['female'],article_count['male'],article_count['unknown']])

    # Nothing to chart when every byline was ignored (would divide by zero).
    if people_total == 0:
        print("{0}: no non-ignored bylines, skipping charts".format(org))
        continue

    # Titles report people_total (the bylines actually charted) rather than
    # len(media_people[org]), which also counts ignored bylines.
    #ARTICLE COUNT CHART
    _plot_gender_pie(
        article_count, total,
        'Author Gender per Article in {0} across {1} authors and {2} articles'.format(org, people_total, total))
    #PEOPLE COUNT CHART
    _plot_gender_pie(
        people_count, people_total,
        'Unique Author Gender in {0} across {1} authors and {2} articles'.format(org, people_total, total))

for l in pct_table:
    print(l)
In [ ]:
# Spot-check a single lookup: display the label BylineGender assigns to the
# byline "editorial board" for the organisation "washington post".
b.org_name_gender("washington post","editorial board")
In [57]:
# Compare the classifier's "oped" label against the outlet's own section name.
# A section containing "opinion" is treated as ground truth for an op-ed.
#   correct_included   - opinion section, classified oped      (true positive)
#   incorrect_excluded - opinion section, not classified oped  (false negative)
#   incorrect_included - other section,   classified oped      (false positive)
#   correct_excluded   - other section,   not classified oped  (true negative)
accuracy = {}
for line in lines:
    mediakey = MEDIA[line['media_id']]
    if mediakey not in accuracy:
        accuracy[mediakey] = {"correct_included":0,"correct_excluded":0,
                              "incorrect_included":0,"incorrect_excluded":0}
    section = line['section']
    is_oped = line['topic'] == "oped"
    in_opinion_section = section is not None and section.lower().find("opinion") >= 0
    if in_opinion_section:
        outcome = 'correct_included' if is_oped else 'incorrect_excluded'
    else:
        outcome = 'incorrect_included' if is_oped else 'correct_excluded'
    accuracy[mediakey][outcome] += 1

# Derive precision/recall once per outlet, after all rows are counted
# (the original recomputed both on every article row).
for counts in accuracy.values():
    true_positives = counts['correct_included']
    classified_oped = true_positives + counts['incorrect_included']
    opinion_section = true_positives + counts['incorrect_excluded']
    # Both metrics fall back to 0 unless both denominators are positive.
    if classified_oped > 0 and opinion_section > 0:
        counts['precision'] = float(true_positives) / float(classified_oped)
        counts['recall'] = float(true_positives) / float(opinion_section)
    else:
        counts['precision'] = 0.
        counts['recall'] = 0.

print("publication,precision, recall, correct_included,correct_excluded,incorrect_included,incorrect_excluded")
for mediakey in accuracy.keys():
    print("{0},{1},{2},{3},{4},{5},{6}".format(mediakey,accuracy[mediakey]['precision'],accuracy[mediakey]['recall'],accuracy[mediakey]['correct_included'],accuracy[mediakey]['correct_excluded'],accuracy[mediakey]['incorrect_included'],accuracy[mediakey]['incorrect_excluded']))
In [ ]: