In [2]:
#TODO: words written by men and women
import csv
import os
import sys

import dateutil
import dateutil.parser
import matplotlib.pyplot as plt
import numpy as np
import pylab as P
# Raise the csv per-field size cap as high as the platform allows; presumably
# some fields in the exports exceed the default limit -- TODO confirm.
try:
    csv.field_size_limit(sys.maxsize)
except OverflowError:
    # sys.maxsize does not fit in a C long on some platforms (e.g. 64-bit
    # Windows); fall back to the largest signed 32-bit value.
    csv.field_size_limit(2 ** 31 - 1)

# Every retained row from every monthly export, as a dict keyed by column name.
lines = []

#CHOOSE A FOLDER FROM ABOVE
#folder = "./test_results/30bf9fd5bb824eb49e89c8a828276348c0b1570c/"
folder = "./annual_data/"
files = ["[2013-08-01T00:00:00Z TO 2013-09-01T00:00:00Z].csv",
         "[2013-09-01T00:00:00Z TO 2013-10-01T00:00:00Z].csv",
         "[2013-10-01T00:00:00Z TO 2013-11-01T00:00:00Z].csv",
         "[2013-11-01T00:00:00Z TO 2013-12-01T00:00:00Z].csv",
         "[2013-12-01T00:00:00Z TO 2014-01-01T00:00:00Z].csv",
         "[2014-01-01T00:00:00Z TO 2014-02-01T00:00:00Z].csv",
         "[2014-02-01T00:00:00Z TO 2014-03-01T00:00:00Z].csv",
         "[2014-03-01T00:00:00Z TO 2014-04-01T00:00:00Z].csv",
         "[2014-04-01T00:00:00Z TO 2014-05-01T00:00:00Z].csv",
         "[2014-05-01T00:00:00Z TO 2014-06-01T00:00:00Z].csv",
         "[2014-06-01T00:00:00Z TO 2014-07-01T00:00:00Z].csv",
         "[2014-07-01T00:00:00Z TO 2014-08-01T00:00:00Z].csv"]

for filename in files:
    #with open (os.path.join(folder,"month_06_2014.csv")) as f:
    with open(os.path.join(folder, filename)) as f:
        month_rows = list(csv.DictReader(f))
    # The first parsed row of every file is discarded, exactly as the earlier
    # append-then-pop code did. NOTE(review): presumably that row is a repeated
    # header or placeholder -- confirm against the export format. Slicing
    # (rather than pop) also copes with a file that has no data rows.
    lines.extend(month_rows[1:])

# Show the available column names as the cell output.
lines[0].keys()
Out[2]:
In [3]:
#SUMMARIZE SECTION IDENTIFICATION
lines[0].keys()

# media_id (as it appears in the CSV) -> human-readable outlet name.
MEDIA = {
    '1': "new york times",
    '2': "washington post",
    '6': "la times",
    '7': "new york post",
    '1150': "wall street journal",
    '1757': "salon",
    '1707': "daily beast",
    '1750': "telegraph",
    '314': "huffington post",
    "27502": "huffington post",  #assuming these are the same for now
}

# media[outlet][section] -> number of rows carrying that section label.
media = {}
for line in lines:
    mediakey = MEDIA[line['media_id']]
    section = line['section']
    section_counts = media.setdefault(mediakey, {})
    section_counts[section] = section_counts.get(section, 0) + 1

# One summary line per outlet: distinct sections and total rows.
for key in media.keys():
    articles = 0
    for section, n_rows in media[key].items():
        articles = articles + n_rows
    print("{0}: {1} sections, {2} articles".format(key, len(media[key]), articles))
    #for section in media[key].keys():
    #    if(not section is None and section.lower().find("opinion")>=0):
    #        print "  {0}: {1}".format(section,media[key][section])
In [5]:
# GROUP BYLINES BY MEDIA ORGANISATION
# AND SUMMARIZE
from byline_gender import BylineGender

b = BylineGender()

# media_people[outlet][byline] -> {"dates": [...], "count": n, "opinion": bool}
media_people = {}
# Distinct section labels in first-seen order (only used for ad-hoc inspection).
sections = []
# Total number of (row, byline) pairs found.
count = 0

for line in lines:
    mediakey = MEDIA[line['media_id']]
    byline_text = line['byline']

    # Read this row's section BEFORE testing it. Previously the test ran first,
    # so every row was classified using the previous row's section (and the
    # first row used whatever `section` was left over from an earlier cell).
    section = line['section']
    opinion = section is not None and "opinion" in section.lower()

    if section not in sections:
        sections.append(section)

    for byline in b.get_full_names(byline_text):
        count += 1
        if mediakey not in media_people:
            media_people[mediakey] = {}
        if byline not in media_people[mediakey]:
            media_people[mediakey][byline] = {"dates": [], "count": 0, 'opinion': False}
        person = media_people[mediakey][byline]
        person['count'] += 1
        person['dates'].append(line['publish_date'])
        # Sticky flag: one opinion-section row marks the byline for good.
        if opinion:
            person['opinion'] = True

print("---")
#print sections
print("{0} bylines detected in {1} lines".format(count, len(lines)))
for key in media_people.keys():
    print("{0}: {1} bylines".format(key, len(media_people[key])))
In [6]:
#add posting frequency ratio to media_people, now that all lines have been processed
#posting frequency is the average of the intervals between posts
#org = 'huffington post'
# Import the submodule explicitly: a bare `import dateutil` is not guaranteed
# to make `dateutil.parser` available.
import dateutil.parser


def _mean_interval_days(date_strings, default=365.0):
    """Return the mean gap, in whole days, between consecutive posts.

    date_strings -- publish dates as strings accepted by dateutil.parser.parse
    default      -- value returned when fewer than two dates are available,
                    i.e. when no interval can be measured

    Dates are parsed once and sorted as datetimes, so the ordering is
    chronological regardless of how the strings happen to be formatted.
    """
    if len(date_strings) < 2:
        return default
    parsed = sorted(dateutil.parser.parse(d) for d in date_strings)
    gaps = [(later - earlier).days for earlier, later in zip(parsed, parsed[1:])]
    return np.mean(gaps)


for org in media_people.keys():
    for p in media_people[org].keys():
        media_people[org][p]['interval'] = _mean_interval_days(media_people[org][p]['dates'])
In [13]:
# REPORT precision and recall for a given threshhold
threshhold = 31.0


def op_by_freq(x, threshold=None):
    """Return True when a byline record counts as an infrequent poster.

    x         -- a media_people record with 'dates' and 'interval' entries
    threshold -- interval cut-off; when omitted, the module-level `threshhold`
                 is read at call time, which is what existing callers rely on

    A record qualifies when it has at most two dates or when its average
    interval exceeds the cut-off.
    """
    if threshold is None:
        threshold = threshhold
    return len(x['dates']) <= 2 or x['interval'] > threshold


def _fmt_ratio(numerator, denominator):
    """Format numerator/denominator to 3 decimals, or 'n/a' for a zero denominator."""
    if denominator == 0:
        return "n/a"
    return "{0:.3f}".format(float(numerator) / denominator)


print("Using an interval threshhold of {0}:\n".format(threshhold))
for org in media_people.keys():
    # Confusion counts keyed by (flagged as opinion, retrieved by frequency),
    # gathered in a single pass over the outlet's bylines.
    tally = {(True, True): 0, (True, False): 0, (False, True): 0, (False, False): 0}
    for x in media_people[org].values():
        tally[(bool(x['opinion']), bool(op_by_freq(x)))] += 1

    true_pos = tally[(True, True)]
    false_pos = tally[(False, True)]
    true_neg = tally[(False, False)]
    false_neg = tally[(True, False)]

    print("{0}".format(org))
    print("{0} were accurately retrieved".format(true_pos))
    print("{0} were inaccurately retrieved".format(false_pos))
    print("{0} were accurately excluded".format(true_neg))
    print("{0} were inaccurately excluded".format(false_neg))
    # The cell header promises precision and recall; derive them from the counts.
    print("precision: {0}".format(_fmt_ratio(true_pos, true_pos + false_pos)))
    print("recall: {0}".format(_fmt_ratio(true_pos, true_pos + false_neg)))
    print("")
In [7]:
#SHOW HISTOGRAMS of average posting interval
# Loop-invariant, so set once. This rebinds the same global that op_by_freq reads.
threshhold = 31.0
for org in media_people.keys():
    # 365.0 is the value stored for bylines with fewer than two dates, so the
    # strict comparison leaves those bylines out of the distribution.
    values = [x['interval'] for x in media_people[org].values() if x['interval'] < 365.0]
    above = sum(1 for v in values if v > threshhold)
    print("{0}: {1}/{2} bylines posting with a >{3} interval".format(org, above, len(values), threshhold))
    if not values:
        # Nothing to summarise or plot; max([]) would raise below.
        continue
    # Placeholders now match their labels (mean and median used to be swapped).
    print("{0}: {1} mean interval. {2} median interval".format(org, np.mean(values), np.median(values)))
    # hist needs a positive integer bin count; the intervals are floats and may be 0.
    n_bins = max(1, int(max(values)))
    plt.hist(values, n_bins)
    plt.xlabel("Posting intervals in Published in {0}".format(org))
    plt.ylabel('Number of Authors', fontsize= 20)
    plt.show()
In [8]:
# Histogram of articles per byline, one figure per outlet in alphabetical order.
# Built-in sorted() replaces the bare sort(), which only exists when the kernel
# was started with a pylab star-import.
for key in sorted(media_people.keys()):
    print("{0}: {1} bylines".format(key, len(media_people[key])))
    values = [x['count'] for x in media_people[key].values()]
    if not values:
        # No bylines recorded for this outlet; max([]) would raise.
        continue
    # One bin per possible article count.
    plt.hist(values, max(values))
    plt.xlabel("Articles Published in {0}".format(key))
    plt.ylabel('Number of Authors', fontsize= 20)
    plt.show()
In [9]:
#OUTPUT TO BYLINE FILE
#mp = {}
#for org in media_people.keys():
# mp[org] = {}
# for name in media_people[org].keys():
# if(op_by_freq(media_people[org][name])):
# mp[org][name] = media_people[org][name]['count']
#
#f = open('interval_freq_org_people_upload.csv', 'w')
#b.export_org_names(mp,f)
#f.close()
In [14]:
from byline_gender import BylineGender

b = BylineGender()
b.load_name_org_online()

# Names collected across all outlets, split by whether a gender was assigned.
unknown = []
known = []

for org in media_people.keys():
    print("{0}: {1} bylines".format(org, len(media_people[org])))
    # vals[gender][name] -> article count, for bylines that pass op_by_freq.
    vals = {"female": {}, "male": {}, "unknown": {}}
    for name in media_people[org].keys():
        person = media_people[org][name]
        if not op_by_freq(person):
            continue
        #gender = b.single_name_gender(name)
        gender = b.org_name_gender(org, name)
        if gender == "ignore":
            continue
        vals[gender][name] = person['count']
        # Compare by value: `is "unknown"` only worked through string interning.
        if gender == "unknown":
            unknown.append(name)
        else:
            known.append(name)

    # One histogram series per gender, in alphabetical label order. An empty
    # series is replaced by the single placeholder value [0].
    labels = sorted(vals.keys())
    h = [list(vals[label].values()) or [0] for label in labels]

    plt.figure()
    n, bins, patches = plt.hist(h)
    plt.xlabel("Articles Published in {0}".format(org))
    plt.ylabel('Number of Authors', fontsize= 20)
    # plt.legend instead of the bare legend() that needs a pylab star-import.
    plt.legend(patches, labels)
    plt.show()

print("UNKNOWN BYLINES: {0}".format(len(unknown)))
print("GUESSED BYLINES: {0}".format(len(known)))
In [17]:
def pct(a,b):
    """Return a as a percentage of b, or 0.0 when b is zero.

    The zero guard matters when no byline of an outlet passes the filters,
    which leaves both totals at 0.
    """
    if not b:
        return 0.0
    return 100*(float(a)/float(b))


def _gender_pie(counts, denominator, title):
    """Draw one female/male/unknown pie chart from a dict of counts."""
    colors = '#78E678', '#E8CA53', "#CCCCCC"
    labels = 'female', 'male', 'unknown'
    P.figure(1, figsize=(6,6))
    fracs = [pct(counts[label], denominator) for label in labels]
    # Pull the 'female' wedge slightly out of the pie.
    explode = (0.06, 0, 0)
    P.pie(fracs, explode=explode, colors=colors, labels=labels,
          autopct='%1.1f%%')
    P.title(title, bbox={'facecolor':'0.8', 'pad':5})
    P.show()


#['huffington post','salon'
for org in media_people.keys():
    print("{0}: {1} bylines".format(org, len(media_people[org])))
    article_count = {"female":0, "male":0, "unknown":0}
    people_count = {"female":0, "male":0, "unknown":0}
    total = 0          # articles by the bylines counted below
    people_total = 0   # bylines counted below
    for name in media_people[org].keys():
        person = media_people[org][name]
        if not op_by_freq(person):
            continue
        gender = b.org_name_gender(org, name)
        if gender == "ignore":
            continue
        article_count[gender] += person['count']
        people_count[gender] += 1
        people_total += 1
        total += person['count']

    if people_total == 0:
        # Nothing passed the filters; an all-zero pie cannot be drawn.
        print("{0}: no qualifying bylines to chart".format(org))
        continue

    # Both titles report people_total, the number of bylines actually charted.
    # They used to report len(media_people[org]), the unfiltered count, next to
    # the filtered article total.
    #ARTICLE COUNT CHART
    _gender_pie(article_count, total,
                'Author Gender per Article in {0} across {1} authors and {2} articles'.format(org, people_total, total))
    #PEOPLE COUNT CHART
    _gender_pie(people_count, people_total,
                'Unique Author Gender in {0} across {1} authors and {2} articles'.format(org, people_total, total))
In [94]:
def pct(a,b):
    """Return a as a percentage of b, or 0.0 when b is zero.

    The zero guard matters when no byline of an outlet passes the filters,
    which leaves both totals at 0.
    """
    if not b:
        return 0.0
    return 100*(float(a)/float(b))


# article_pcts[org][gender] -> share of articles (percent)
# ppl_pcts[org][gender]     -> share of distinct bylines (percent)
# Each entry is written once, directly as a float; the earlier list
# placeholders were always replaced before being read.
article_pcts = {}
ppl_pcts = {}
labels = 'female', 'male', 'unknown'

#['huffington post','salon'
for org in media_people.keys():
    print("{0}: {1} bylines".format(org, len(media_people[org])))
    article_count = {"female":0, "male":0, "unknown":0}
    people_count = {"female":0, "male":0, "unknown":0}
    total = 0          # articles by the bylines counted below
    people_total = 0   # bylines counted below
    for name in media_people[org].keys():
        person = media_people[org][name]
        if not op_by_freq(person):
            continue
        gender = b.org_name_gender(org, name)
        if gender == "ignore":
            continue
        article_count[gender] += person['count']
        people_count[gender] += 1
        people_total += 1
        total += person['count']

    article_pcts[org] = dict((label, pct(article_count[label], total)) for label in labels)
    ppl_pcts[org] = dict((label, pct(people_count[label], people_total)) for label in labels)
In [102]:
# Spot-check one outlet: per-article shares first, then per-contributor shares.
for pct_table in (article_pcts, ppl_pcts):
    print(pct_table['washington post'])
In [116]:
# Two stacked bars per outlet: x=0 is the per-article share, x=1 the
# per-contributor share; female at the bottom, then male, then unknown.
ind = [0, 1]
width = 0.55
for org in media_people.keys():
    by_article = article_pcts[org]
    by_person = ppl_pcts[org]

    female = [by_article['female'], by_person['female']]
    male = [by_article['male'], by_person['male']]
    unknown = [by_article['unknown'], by_person['unknown']]

    # Baselines for the second and third layers of each stack.
    bottom_one = list(female)
    bottom_two = [m + f for m, f in zip(male, female)]

    fig = plt.figure(figsize=(3, 5))
    ax = fig.add_subplot(111)
    p1 = ax.bar(ind, female, width, color="#78E678")
    p2 = ax.bar(ind, male, width, bottom=bottom_one, color='#E8CA53')
    p3 = ax.bar(ind, unknown, width, bottom=bottom_two, color='#cccccc')

    ax.set_ylabel("Byline ratios by article")
    ax.set_title("{0} gender ratio among infrequent posters".format(org))
    #x = plt.xticks(ind)#, media_people.keys())
    ax.set_xticks(ind)
    ax.set_xticklabels(["article bylines", "contributors"],
                       rotation=45, fontsize=10, ha='center')
    # Anchor the legend just outside the axes, to the right.
    ax.legend([p1, p2, p3], ["% female", "% male", "% unknown"],
              bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()
In [117]:
#ind = range(0,len(ppl_pcts["female"]))
width = 0.35
unknownbottom = []
# Disabled all-outlets stacked bar chart, kept as commented-out code:
#for i in range(0,len(article_pcts['male'])):
#    unknownbottom.append(article_pcts['male'][i]+article_pcts['female'][i])##
#fig = plt.figure()
#ax = fig.add_subplot(111)
#p1 = ax.bar(ind, article_pcts['female'], width, color="#78E678")
#p2 = ax.bar(ind, article_pcts['male'], width, bottom=article_pcts['female'], color='#E8CA53')
#p3 = ax.bar(ind, article_pcts['unknown'], width,bottom=unknownbottom, color='#cccccc')
#plt.ylabel("Byline ratios by article")
#plt.title('Gender ratio in infrequent bylines')
#x = plt.xticks(ind)#, media_people.keys())
#ax.set_xticks(ind)
#ax.set_xticklabels(media_people.keys(), rotation=45, fontsize=10,ha='right')
#plt.legend([p1, p2,p3], ["female","male","unknown"],
#    bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#plt.show()

# One comma-separated row per outlet: name, then the female, male and unknown
# per-article percentages.
for org in media_people.keys():
    shares = article_pcts[org]
    print("{0},{1},{2},{3}".format(org, shares['female'], shares['male'], shares['unknown']))
In [ ]: