In [2]:
#TODO: words written by men and women

import csv
import sys  # BUG FIX: sys.maxsize was used below without importing sys
import numpy as np
import matplotlib.pyplot as plt
import pylab as P
import os
import dateutil

# Raise the field-size cap so very large fields (full article text) don't
# make the csv module raise an error.
csv.field_size_limit(sys.maxsize)
lines = []

#CHOOSE A FOLDER FROM ABOVE
#folder = "./test_results/30bf9fd5bb824eb49e89c8a828276348c0b1570c/"
folder = "./annual_data/"

# One CSV per month, August 2013 through July 2014.
files = ["[2013-08-01T00:00:00Z TO 2013-09-01T00:00:00Z].csv",
"[2013-09-01T00:00:00Z TO 2013-10-01T00:00:00Z].csv",
"[2013-10-01T00:00:00Z TO 2013-11-01T00:00:00Z].csv",
"[2013-11-01T00:00:00Z TO 2013-12-01T00:00:00Z].csv",
"[2013-12-01T00:00:00Z TO 2014-01-01T00:00:00Z].csv",
"[2014-01-01T00:00:00Z TO 2014-02-01T00:00:00Z].csv",
"[2014-02-01T00:00:00Z TO 2014-03-01T00:00:00Z].csv",
"[2014-03-01T00:00:00Z TO 2014-04-01T00:00:00Z].csv",
"[2014-04-01T00:00:00Z TO 2014-05-01T00:00:00Z].csv",
"[2014-05-01T00:00:00Z TO 2014-06-01T00:00:00Z].csv",
"[2014-06-01T00:00:00Z TO 2014-07-01T00:00:00Z].csv",
"[2014-07-01T00:00:00Z TO 2014-08-01T00:00:00Z].csv"]

top = 0
for filename in files:
    with open(os.path.join(folder, filename)) as f:
        reader = csv.DictReader(f)
        for row in reader:  # unused enumerate() index removed
            lines.append(row)
    # Drop the first row appended from each file (`top` is the index where this
    # file's rows began).  NOTE(review): DictReader already consumes the header,
    # so this discards a *data* row per file -- presumably the first row of each
    # monthly export is unwanted; confirm against the data source.
    lines.pop(top)
    top = len(lines)

lines[0].keys()


Out[2]:
["byline_gender['female']",
 'byline',
 'media_id',
 "byline_gender['unknown']",
 'url',
 'section',
 'is_opinion',
 'publish_date',
 "byline_gender['male']",
 'extractor.full_text',
 'stories_id']

In [3]:
#SUMMARIZE SECTION IDENTIFICATION
lines[0].keys()

MEDIA= {
  '1': "new york times",
  '2': "washington post",
  '6':"la times",
  '7': "new york post",
  '1150': "wall street journal",
  '1757': "salon",
  '1707': "daily beast",
  '1750': "telegraph",
  '314' : "huffington post",
"27502":"huffington post" #assuming these are the same for now
}

media = {}

for line in lines:
    mediakey = MEDIA[line['media_id']]
    section = line['section']
    if(not mediakey in media):
        media[mediakey] = {}
    if(not section in media[mediakey]):
        media[mediakey][section] = 0
    media[mediakey][section] += 1
        
        
for key in media.keys():
    articles = 0
    for section in media[key].keys():
        articles += media[key][section]
    print "{0}: {1} sections, {2} articles".format(key,len(media[key]),articles)
    #for section in media[key].keys():
    #    if(not section is None and section.lower().find("opinion")>=0):
    #        print "    {0}: {1}".format(section,media[key][section])


salon: 1 sections, 4856 articles
huffington post: 2 sections, 152765 articles
washington post: 918 sections, 12135 articles
la times: 67 sections, 51766 articles
new york post: 18 sections, 46237 articles
new york times: 129 sections, 89820 articles
wall street journal: 41 sections, 72556 articles

In [5]:
# GROUP BYLINES BY MEDIA ORGANISATION
# AND SUMMARIZE
media_people = {}

from byline_gender import BylineGender
b = BylineGender()

sections = []
count = 0
for line in lines:
    mediakey = MEDIA[line['media_id']]
    byline_text = line['byline']

    if(not section is None and section.lower().find("opinion")>=0):
        opinion = True
    else:
        opinion = False

    section = line['section']
    if not section in sections:
        sections.append(section)
    for byline in b.get_full_names(byline_text):
        count += 1
        if(not mediakey in media_people):
            media_people[mediakey] = {}
        if(not byline in media_people[mediakey]):
            media_people[mediakey][byline] = {"dates":[],"count":0, 'opinion':False}
        media_people[mediakey][byline]['count'] += 1
        media_people[mediakey][byline]['dates'].append(line['publish_date'])
        if(opinion):
            media_people[mediakey][byline]['opinion'] = opinion
            
print "---"
#print sections
print "{0} bylines detected in {1} lines".format(count,len(lines))
for key in media_people.keys():
    print "{0}: {1} bylines".format(key,len(media_people[key]))


---
385621 bylines detected in 430135 lines
salon: 914 bylines
huffington post: 17068 bylines
washington post: 910 bylines
la times: 1895 bylines
new york post: 522 bylines
new york times: 5110 bylines
wall street journal: 3457 bylines

In [6]:
#add posting frequency ratio to media_people, now that all lines have been processed
#posting frequency is the average of the intervals between posts
# Authors with a single post get a sentinel interval of 365.0 days.
import dateutil.parser  # BUG FIX: `import dateutil` alone does not load the parser submodule
for org in media_people.keys():
    for p in media_people[org].keys():
        # BUG FIX: bare `sort(...)` is undefined without a pylab star-import;
        # use the builtin sorted() instead.
        dates = sorted(media_people[org][p]['dates'])
        if(len(dates)>1):
            intervals = []
            last_date = dates[0]
            for i in range(1,len(dates)):
                current_date = dates[i]
                intervals.append((dateutil.parser.parse(current_date) - dateutil.parser.parse(last_date)).days)
                last_date = current_date
            avg_interval = np.mean(intervals)
        else:
            avg_interval = 365.0  # sentinel: "posted once in the year"

        media_people[org][p]['interval'] = avg_interval

In [13]:
# REPORT precision and recall for a given threshhold
threshhold = 31.0  # days; mean posting intervals above this count as "infrequent"

def op_by_freq(x):
    """Return True if author record `x` looks like an infrequent poster:
    at most two posts overall, or a mean posting interval above `threshhold`."""
    if len(x['dates']) <= 2:
        return True
    return x['interval'] > threshhold

print "Using an interval threshhold of {0}:\n".format(threshhold)
for org in media_people.keys():
    accurately_retrieved = [x['interval'] for x in media_people[org].values() if (x['opinion'] and op_by_freq(x))]
    inaccurately_retrieved = [x['interval'] for x in media_people[org].values() if (x['opinion'] is False and op_by_freq(x))]
    accurately_excluded = [x['interval'] for x in media_people[org].values() if (x['opinion'] is False and not op_by_freq(x))]
    inaccurately_excluded = [x['interval'] for x in media_people[org].values() if (x['opinion'] and not op_by_freq(x))]
    print "{0}".format(org)
    print "{0} were accurately retrieved".format(len(accurately_retrieved))
    print "{0} were inaccurately retrieved".format(len(inaccurately_retrieved))
    print "{0} were accurately excluded".format(len(accurately_excluded))
    print "{0} were inaccurately excluded".format(len(inaccurately_excluded))
    print ""


Using an interval threshhold of 31.0:

salon
31 were accurately retrieved
745 were inaccurately retrieved
104 were accurately excluded
34 were inaccurately excluded

huffington post
637 were accurately retrieved
13285 were inaccurately retrieved
2292 were accurately excluded
854 were inaccurately excluded

washington post
60 were accurately retrieved
543 were inaccurately retrieved
143 were accurately excluded
164 were inaccurately excluded

la times
347 were accurately retrieved
896 were inaccurately retrieved
223 were accurately excluded
429 were inaccurately excluded

new york post
40 were accurately retrieved
220 were inaccurately retrieved
88 were accurately excluded
174 were inaccurately excluded

new york times
443 were accurately retrieved
3562 were inaccurately retrieved
395 were accurately excluded
710 were inaccurately excluded

wall street journal
251 were accurately retrieved
1966 were inaccurately retrieved
479 were accurately excluded
761 were inaccurately excluded


In [7]:
#SHOW HISTOGRAMS of average posting interval
for org in media_people.keys():
    values = [x['interval'] for x in media_people[org].values() if x['interval'] < 365.0]
    threshhold = 31.0
    print "{0}: {1}/{2} bylines posting with a >{3} interval".format(org,len([x for x in values if x>threshhold]),len(values),threshhold)
    print "{0}: {2} mean interval. {1} median interval".format(org, np.mean(values),np.median(values))
    plt.hist(values, max(values))
    plt.xlabel("Posting intervals in Published in {0}".format(org))
    plt.ylabel('Number of Authors', fontsize= 20)
    plt.show()


salon: 82/269 bylines posting with a >31.0 interval
salon: 18.0 mean interval. 29.2056073943 median interval
huffington post: 4766/9430 bylines posting with a >31.0 interval
huffington post: 31.7071428571 mean interval. 48.8694474992 median interval
washington post: 109/462 bylines posting with a >31.0 interval
washington post: 12.4672727273 mean interval. 25.6156144957 median interval
la times: 205/1013 bylines posting with a >31.0 interval
la times: 6.0 mean interval. 23.0811251596 median interval
new york post: 21/316 bylines posting with a >31.0 interval
new york post: 3.85357142857 mean interval. 8.65898988544 median interval
new york times: 937/2202 bylines posting with a >31.0 interval
new york times: 22.1493055556 mean interval. 47.2947768703 median interval
wall street journal: 575/1941 bylines posting with a >31.0 interval
wall street journal: 13.1818181818 mean interval. 32.5008195257 median interval

In [8]:
for key in sort(media_people.keys()):
    print "{0}: {1} bylines".format(key,len(media_people[key]))
    values = [x['count'] for x in media_people[key].values()]
    plt.hist(values, max(values))
    plt.xlabel("Articles Published in {0}".format(key))
    plt.ylabel('Number of Authors', fontsize= 20)
    plt.show()


huffington post: 17068 bylines
la times: 1895 bylines
new york post: 522 bylines
new york times: 5110 bylines
salon: 914 bylines
wall street journal: 3457 bylines
washington post: 910 bylines

In [9]:
#OUTPUT TO BYLINE FILE
#mp = {}
#for org in media_people.keys():
#    mp[org] = {}
#    for name in media_people[org].keys():
#        if(op_by_freq(media_people[org][name])):
#            mp[org][name] = media_people[org][name]['count']
#
#f = open('interval_freq_org_people_upload.csv', 'w')
#b.export_org_names(mp,f)
#f.close()

In [14]:
from byline_gender import BylineGender
b = BylineGender()
b.load_name_org_online()
unknown = []
known = []

for org in media_people.keys():
    print "{0}: {1} bylines".format(org,len(media_people[org]))
    vals = {"female":{},"male":{},"unknown":{}}
    for name in media_people[org].keys():
        if(not op_by_freq(media_people[org][name])):
            continue
        #gender = b.single_name_gender(name)
        gender = b.org_name_gender(org,name)
        if(not gender in ["ignore"]):
            vals[gender][name]=media_people[org][name]['count']
            if gender is "unknown":
                unknown.append(name)
            else:
                known.append(name)    
    m = 0
    for v in vals.values():
        if(len(v) > 0 and max(v)>m):
            m = max(v)
    
    h = []
    labels = []
    for v in sort(vals.keys()):
        labels.append(v)
        h.append(vals[v].values())
        if(len(h[-1]) == 0):
            h[-1]=[0]
    plt.figure()    
    n,bins,patches = plt.hist(h)
    plt.xlabel("Articles Published in {0}".format(org))
    plt.ylabel('Number of Authors', fontsize= 20)
    legend(patches, labels)
    plt.show()

print "UNKNOWN BYLINES: {0}".format(len(unknown))
print "GUESSED BYLINES: {0}".format(len(known))


salon: 914 bylines
huffington post: 17068 bylines
washington post: 910 bylines
la times: 1895 bylines
new york post: 522 bylines
new york times: 5110 bylines
wall street journal: 3457 bylines
UNKNOWN BYLINES: 5838
GUESSED BYLINES: 17184

In [17]:
def pct(a, b):
    """Return `a` as a floating-point percentage of `b`."""
    return (float(a) / float(b)) * 100

#['huffington post','salon'
for org in media_people.keys():
    print "{0}: {1} bylines".format(org,len(media_people[org]))
    article_count = {"female":0,"male":0,"unknown":0}
    people_count = {"female":0,"male":0,"unknown":0}
    
    total = 0
    people_total = 0
    for name in media_people[org].keys():
        if(not op_by_freq(media_people[org][name])):
            continue        
        gender = b.org_name_gender(org,name)
        if(not gender in ["ignore"]):
            article_count[gender]+= media_people[org][name]['count']
            people_count[gender] += 1
            people_total += 1
            total += media_people[org][name]['count'] 

    #ARTICLE COUNT CHART
    colors= '#78E678','#E8CA53',"#CCCCCC"

    P.figure(1, figsize=(6,6))
    labels = 'female', 'male', 'unknown'
    fracs = [pct(article_count['female'],total), pct(article_count['male'],total), pct(article_count['unknown'],total)]
    explode=(0.06, 0, 0)
    P.pie(fracs, explode=explode, colors=colors, labels=labels,
                    autopct='%1.1f%%')
    P.title('Author Gender per Article in {0} across {1} authors and {2} articles'.format(org,len(media_people[org]), total), bbox={'facecolor':'0.8', 'pad':5})
    P.show()
    
    #PEOPLE COUNT CHART
    P.figure(1, figsize=(6,6))
    labels = 'female', 'male', 'unknown'
    fracs = [pct(people_count['female'],people_total), pct(people_count['male'],people_total), pct(people_count['unknown'],people_total)]
    explode=(0.06, 0, 0)
    P.pie(fracs, explode=explode, colors=colors, labels=labels,
                    autopct='%1.1f%%')
    P.title('Unique Author Gender in {0} across {1} authors and {2} articles'.format(org,len(media_people[org]), total), bbox={'facecolor':'0.8', 'pad':5})
    P.show()


salon: 914 bylines
huffington post: 17068 bylines
washington post: 910 bylines
la times: 1895 bylines
new york post: 522 bylines
new york times: 5110 bylines
wall street journal: 3457 bylines

In [94]:
def pct(a,b):
    return 100*(float(a)/float(b))


article_pcts = {}
ppl_pcts = {}
for org in media_people.keys():
    article_pcts[org]={"male":[],"female":[],"unknown":[]}
    ppl_pcts[org]={"male":[],"female":[],"unknown":[]}


#['huffington post','salon'
for org in media_people.keys():
    print "{0}: {1} bylines".format(org,len(media_people[org]))
    article_count = {"female":0,"male":0,"unknown":0}
    people_count = {"female":0,"male":0,"unknown":0}
    
    total = 0
    people_total = 0
    for name in media_people[org].keys():
        if(not op_by_freq(media_people[org][name])):
            continue        
        gender = b.org_name_gender(org,name)
        if(not gender in ["ignore"]):
            article_count[gender]+= media_people[org][name]['count']
            people_count[gender] += 1
            people_total += 1
            total += media_people[org][name]['count'] 

    #ARTICLE COUNT CHART
    colors= '#78E678','#E8CA53',"#CCCCCC"

    labels = 'female', 'male', 'unknown'
    for label in labels:
        article_pcts[org][label] = pct(article_count[label],total)
        ppl_pcts[org][label] = pct(people_count[label],people_total)


salon: 914 bylines
huffington post: 17068 bylines
washington post: 910 bylines
la times: 1895 bylines
new york post: 522 bylines
new york times: 5110 bylines
wall street journal: 3457 bylines

In [102]:
# Spot-check one outlet: article-level vs unique-author gender percentages
# (each is a {gender: percent} dict built in the previous cell).
print article_pcts['washington post']
print ppl_pcts['washington post']


{'unknown': 21.809045226130653, 'male': 45.32663316582914, 'female': 32.8643216080402}
{'unknown': 22.056384742951906, 'male': 47.2636815920398, 'female': 30.679933665008292}

In [116]:
# Per-outlet stacked bar chart comparing the gender split of article bylines
# (left bar) against unique contributors (right bar).
ind = [0, 1]   # x positions: article bylines, contributors
width = 0.55

for org in media_people.keys():
    female = [article_pcts[org]['female'], ppl_pcts[org]['female']]
    male = [article_pcts[org]['male'], ppl_pcts[org]['male']]
    unknown = [article_pcts[org]['unknown'], ppl_pcts[org]['unknown']]

    # Stack offsets: male sits on top of female, unknown on top of both.
    bottom_one = list(female)
    bottom_two = [article_pcts[org]['male'] + article_pcts[org]['female'],
                  ppl_pcts[org]['male'] + ppl_pcts[org]['female']]

    fig = plt.figure()
    fig.set_size_inches(3, 5)
    ax = fig.add_subplot(111)
    bars_female = ax.bar(ind, female, width, color="#78E678")
    bars_male = ax.bar(ind, male, width, bottom=bottom_one, color='#E8CA53')
    bars_unknown = ax.bar(ind, unknown, width, bottom=bottom_two, color='#cccccc')
    plt.ylabel("Byline ratios by article")
    plt.title("{0} gender ratio among infrequent posters".format(org))

    ax.set_xticks(ind)
    ax.set_xticklabels(["article bylines", "contributors"], rotation=45, fontsize=10, ha='center')
    plt.legend([bars_female, bars_male, bars_unknown], ["% female", "% male", "% unknown"],
               bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()



In [117]:
#ind = range(0,len(ppl_pcts["female"]))
width = 0.35

unknownbottom = []
    
#for i in range(0,len(article_pcts['male'])):
#    unknownbottom.append(article_pcts['male'][i]+article_pcts['female'][i])##

#fig = plt.figure()   
#ax = fig.add_subplot(111)
#p1 = ax.bar(ind, article_pcts['female'],   width, color="#78E678")
#p2 = ax.bar(ind, article_pcts['male'],  width, bottom=article_pcts['female'], color='#E8CA53')
#p3 = ax.bar(ind, article_pcts['unknown'], width,bottom=unknownbottom, color='#cccccc')
#plt.ylabel("Byline ratios by article")
#plt.title('Gender ratio in infrequent bylines')

#x = plt.xticks(ind)#, media_people.keys())
#ax.set_xticks(ind)
#ax.set_xticklabels(media_people.keys(), rotation=45, fontsize=10,ha='right')
#plt.legend([p1, p2,p3], ["female","male","unknown"],
#           bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
#plt.show()


for org in media_people.keys():
    print "{0},{1},{2},{3}".format(org,article_pcts[org]['female'],
                                   article_pcts[org]['male'],
                                   article_pcts[org]['unknown'])


salon,32.7217125382,44.750254842,22.5280326198
huffington post,34.6546024013,37.0802157647,28.265181834
washington post,32.864321608,45.3266331658,21.8090452261
la times,27.4691358025,52.2633744856,20.2674897119
new york post,30.8176100629,52.2012578616,16.9811320755
new york times,34.450901505,46.8037550291,18.745343466
wall street journal,23.4174085064,55.68743818,20.8951533136

In [ ]: