In [16]:
#TODO: words written by men and women

import csv
import sys
import numpy as np
import matplotlib.pyplot as plt
import pylab as P
import os
import dateutil.parser

csv.field_size_limit(sys.maxsize)
lines = []

#CHOOSE A FOLDER FROM ABOVE
#folder = "./test_results/30bf9fd5bb824eb49e89c8a828276348c0b1570c/"
folder = "./annual_data/"

files = ["[2013-08-01T00:00:00Z TO 2013-09-01T00:00:00Z].csv",
"[2013-09-01T00:00:00Z TO 2013-10-01T00:00:00Z].csv",
"[2013-10-01T00:00:00Z TO 2013-11-01T00:00:00Z].csv",
"[2013-11-01T00:00:00Z TO 2013-12-01T00:00:00Z].csv",
"[2013-12-01T00:00:00Z TO 2014-01-01T00:00:00Z].csv",
"[2014-01-01T00:00:00Z TO 2014-02-01T00:00:00Z].csv",
"[2014-02-01T00:00:00Z TO 2014-03-01T00:00:00Z].csv",
"[2014-03-01T00:00:00Z TO 2014-04-01T00:00:00Z].csv",
"[2014-04-01T00:00:00Z TO 2014-05-01T00:00:00Z].csv",
"[2014-05-01T00:00:00Z TO 2014-06-01T00:00:00Z].csv",
"[2014-06-01T00:00:00Z TO 2014-07-01T00:00:00Z].csv",
"[2014-07-01T00:00:00Z TO 2014-08-01T00:00:00Z].csv"]

top = 0
for filename in files:
    #with open (os.path.join(folder,"month_06_2014.csv")) as f:
    with open(os.path.join(folder, filename)) as f:
        reader = csv.DictReader(f)
        for row in reader:
            lines.append(row)
    # drop the first row read from each file, then remember where the next file starts
    lines.pop(top)
    top = len(lines)

lines[0].keys()


Out[16]:
["byline_gender['female']",
 'byline',
 'media_id',
 "byline_gender['unknown']",
 'url',
 'section',
 'is_opinion',
 'publish_date',
 "byline_gender['male']",
 'extractor.full_text',
 'stories_id']
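
The top/pop bookkeeping in the loading loop drops the first row appended from each file. A minimal sketch of the same load written per file, so the dropped row is explicit (this assumes, as the loop above does, that the first row of every export file is discardable):

# Sketch: equivalent load with the per-file drop made explicit
# (uses the same `folder` and `files` defined above).
lines = []
for filename in files:
    with open(os.path.join(folder, filename)) as f:
        rows = list(csv.DictReader(f))
    lines.extend(rows[1:])  # skip each file's first row, as lines.pop(top) does above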

In [17]:
#SUMMARIZE SECTION IDENTIFICATION
lines[0].keys()

MEDIA = {
    '1': "new york times",
    '2': "washington post",
    '6': "la times",
    '7': "new york post",
    '1150': "wall street journal",
    '1757': "salon",
    '1707': "daily beast",
    '1750': "telegraph",
    '314': "huffington post",
    '27502': "huffington post"  #assuming these are the same for now
}

media = {}

for line in lines:
    mediakey = MEDIA[line['media_id']]
    section = line['section']
    if mediakey not in media:
        media[mediakey] = {}
    if section not in media[mediakey]:
        media[mediakey][section] = 0
    media[mediakey][section] += 1
        
        
for key in media.keys():
    articles = sum(media[key].values())
    print "{0}: {1} sections, {2} articles".format(key, len(media[key]), articles)
    #for section in media[key].keys():
    #    if(not section is None and section.lower().find("opinion")>=0):
    #        print "    {0}: {1}".format(section,media[key][section])


salon: 1 sections, 4856 articles
huffington post: 2 sections, 152765 articles
washington post: 918 sections, 12135 articles
la times: 67 sections, 51766 articles
new york post: 18 sections, 46237 articles
new york times: 129 sections, 89820 articles
wall street journal: 41 sections, 72556 articles
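
For reference, the per-outlet section tally above can also be written with collections.Counter; a sketch over the same lines and MEDIA built above (not the notebook's own code):

# Sketch: per-outlet section counts with collections.Counter
from collections import Counter, defaultdict

section_counts = defaultdict(Counter)
for line in lines:
    section_counts[MEDIA[line['media_id']]][line['section']] += 1

for outlet, counts in section_counts.items():
    print "{0}: {1} sections, {2} articles".format(outlet, len(counts), sum(counts.values()))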

In [20]:
# GROUP BYLINES BY MEDIA ORGANISATION
# AND SUMMARIZE
media_people = {}

from byline_gender import BylineGender
b = BylineGender()

sections = []
count = 0
for line in lines:
    mediakey = MEDIA[line['media_id']]
    byline_text = line['byline']
    section = line['section']

    opinion = section is not None and "opinion" in section.lower()

    if not opinion: #TEMPORARY ADDITION TO FOCUS ON OPINION WRITING
        continue

    if section not in sections:
        sections.append(section)
    for byline in b.get_full_names(byline_text):
        count += 1
        if mediakey not in media_people:
            media_people[mediakey] = {}
        if byline not in media_people[mediakey]:
            media_people[mediakey][byline] = {"dates": [], "count": 0, 'opinion': False}
        media_people[mediakey][byline]['count'] += 1
        media_people[mediakey][byline]['dates'].append(line['publish_date'])
        if opinion:
            media_people[mediakey][byline]['opinion'] = opinion
            
print "---"
#print sections
print "{0} bylines detected in {1} lines".format(count,len(lines))
for key in media_people.keys():
    print "{0}: {1} bylines".format(key,len(media_people[key]))


---
13742 bylines detected in 430135 lines
washington post: 192 bylines
new york post: 116 bylines
new york times: 1445 bylines
wall street journal: 559 bylines
la times: 918 bylines
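
The opinion test above is a plain substring match on the section name. A small helper version with a few illustrative inputs (the helper name and the example section names are ours, not the notebook's):

# Sketch: the opinion-section test as a helper
def is_opinion(section):
    return section is not None and "opinion" in section.lower()

print is_opinion("Opinion")             # True
print is_opinion("opinion/editorials")  # True (hypothetical section name)
print is_opinion("Sports")              # False
print is_opinion(None)                  # False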

In [21]:
#add a posting interval to media_people, now that all lines have been processed
#the interval is the average number of days between consecutive posts;
#bylines with a single post default to an interval of 365 days
#org = 'huffington post'
import dateutil.parser

for org in media_people.keys():
    for p in media_people[org].keys():
        dates = sorted(media_people[org][p]['dates'])
        if len(dates) > 1:
            intervals = []
            last_date = dates[0]
            for i in range(1, len(dates)):
                current_date = dates[i]
                intervals.append((dateutil.parser.parse(current_date) - dateutil.parser.parse(last_date)).days)
                last_date = current_date
            avg_interval = np.mean(intervals)
            #print "{0}: avg {1}, intervals: {2}".format(p, np.mean(intervals), intervals)
        else:
            avg_interval = 365.0

        media_people[org][p]['interval'] = avg_interval
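
As a worked example of the interval metric (hand-made dates, not notebook data): posts on 2014-01-01, 2014-01-08 and 2014-01-22 give gaps of 7 and 14 days, so an average interval of 10.5 days; a byline with a single post gets the 365-day default above.

# Sketch: the interval computation on a tiny hand-made date list
example_dates = sorted(["2014-01-22T00:00:00Z", "2014-01-01T00:00:00Z", "2014-01-08T00:00:00Z"])
gaps = [(dateutil.parser.parse(b) - dateutil.parser.parse(a)).days
        for a, b in zip(example_dates, example_dates[1:])]
print gaps           # [7, 14]
print np.mean(gaps)  # 10.5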

In [22]:
# REPORT confusion-matrix counts (toward precision and recall) for a given threshold
# NOTE: the cell above currently keeps only opinion-section articles, so every byline here has
# opinion=True; the "inaccurately retrieved" and "accurately excluded" counts are therefore zero.
threshhold = 31.0
def op_by_freq(x):
    # treat a byline as an opinion writer if it has at most two posts or a long average interval
    return len(x['dates']) <= 2 or x['interval'] > threshhold

print "Using an interval threshold of {0}:\n".format(threshhold)
for org in media_people.keys():
    accurately_retrieved = [x['interval'] for x in media_people[org].values() if (x['opinion'] and op_by_freq(x))]
    inaccurately_retrieved = [x['interval'] for x in media_people[org].values() if (x['opinion'] is False and op_by_freq(x))]
    accurately_excluded = [x['interval'] for x in media_people[org].values() if (x['opinion'] is False and not op_by_freq(x))]
    inaccurately_excluded = [x['interval'] for x in media_people[org].values() if (x['opinion'] and not op_by_freq(x))]
    print "{0}".format(org)
    print "{0} were accurately retrieved".format(len(accurately_retrieved))
    print "{0} were inaccurately retrieved".format(len(inaccurately_retrieved))
    print "{0} were accurately excluded".format(len(accurately_excluded))
    print "{0} were inaccurately excluded".format(len(inaccurately_excluded))
    print ""


Using an interval threshold of 31.0:

washington post
139 were accurately retrieved
0 were inaccurately retrieved
0 were accurately excluded
53 were inaccurately excluded

new york post
92 were accurately retrieved
0 were inaccurately retrieved
0 were accurately excluded
24 were inaccurately excluded

new york times
1394 were accurately retrieved
0 were inaccurately retrieved
0 were accurately excluded
51 were inaccurately excluded

wall street journal
533 were accurately retrieved
0 were inaccurately retrieved
0 were accurately excluded
26 were inaccurately excluded

la times
785 were accurately retrieved
0 were inaccurately retrieved
0 were accurately excluded
133 were inaccurately excluded
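
The cell above prints the four confusion-matrix counts; precision and recall follow directly from them (a sketch with a helper of our own naming, illustrated on the washington post counts above):

# Sketch: precision and recall from the counts printed above
def precision_recall(true_pos, false_pos, false_neg):
    precision = float(true_pos) / (true_pos + false_pos) if (true_pos + false_pos) else 0.0
    recall = float(true_pos) / (true_pos + false_neg) if (true_pos + false_neg) else 0.0
    return precision, recall

print precision_recall(139, 0, 53)  # washington post: precision 1.0, recall ~0.72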


In [23]:
#SHOW HISTOGRAMS of average posting interval
for org in media_people.keys():
    values = [x['interval'] for x in media_people[org].values() if x['interval'] < 365.0]
    threshhold = 7.0  #NOTE: this also changes the global threshhold that op_by_freq() reads in later cells
    print "{0}: {1}/{2} bylines posting with a >{3} day interval".format(org, len([x for x in values if x > threshhold]), len(values), threshhold)
    print "{0}: {1} median interval, {2} mean interval".format(org, np.median(values), np.mean(values))
    plt.hist(values, int(max(values)))
    plt.xlabel("Average posting interval (days) in {0}".format(org))
    plt.ylabel('Number of Authors', fontsize=20)
    plt.show()


washington post: 32/72 bylines posting with a >7.0 day interval
washington post: 6.29571843251 median interval, 21.201597476 mean interval
new york post: 23/35 bylines posting with a >7.0 day interval
new york post: 8.8 median interval, 17.530468911 mean interval
new york times: 177/216 bylines posting with a >7.0 day interval
new york times: 44.0714285714 median interval, 76.0449956733 mean interval
wall street journal: 92/100 bylines posting with a >7.0 day interval
wall street journal: 66.5 median interval, 78.5220184262 mean interval
la times: 170/294 bylines posting with a >7.0 day interval
la times: 9.66666666667 median interval, 29.3318783528 mean interval
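
Note that op_by_freq() reads the module-level threshhold, so the reassignment to 7.0 in the cell above silently changes its behaviour in the cells that follow. A sketch of a variant that takes the cutoff explicitly (the function name is ours):

# Sketch: the same rule with the cutoff passed in, instead of read from a global
def op_by_freq_at(x, threshold=31.0):
    return len(x['dates']) <= 2 or x['interval'] > threshold

# e.g. re-check one outlet at the original 31-day cutoff, regardless of the global value:
#print len([x for x in media_people['washington post'].values() if op_by_freq_at(x, 31.0)])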

In [24]:
for key in sorted(media_people.keys()):
    print "{0}: {1} bylines".format(key, len(media_people[key]))
    values = [x['count'] for x in media_people[key].values()]
    plt.hist(values, max(values))
    plt.xlabel("Articles Published in {0}".format(key))
    plt.ylabel('Number of Authors', fontsize=20)
    plt.show()


la times: 918 bylines
new york post: 116 bylines
new york times: 1445 bylines
wall street journal: 559 bylines
washington post: 192 bylines

In [8]:
#OUTPUT TO BYLINE FILE
#mp = {}
#for org in media_people.keys():
#    mp[org] = {}
#    for name in media_people[org].keys():
#        if(op_by_freq(media_people[org][name])):
#            mp[org][name] = media_people[org][name]['count']
#
#f = open('interval_freq_org_people_upload.csv', 'w')
#b.export_org_names(mp,f)
#f.close()

In [32]:
from byline_gender import BylineGender
b = BylineGender()
b.load_name_org_online()
unknown = []
known = []

for org in media_people.keys():
    print "{0}: {1} bylines".format(org, len(media_people[org]))
    vals = {"female": {}, "male": {}, "unknown": {}}
    for name in media_people[org].keys():
        #if(not op_by_freq(media_people[org][name])):
        #    continue
        #gender = b.single_name_gender(name)
        gender = b.org_name_gender(org, name)
        if gender != "ignore":
            vals[gender][name] = media_people[org][name]['count']
            if gender == "unknown":
                unknown.append(name)
            else:
                known.append(name)

    #largest per-author article count across genders (computed but not used below)
    m = 0
    for v in vals.values():
        if len(v) > 0 and max(v.values()) > m:
            m = max(v.values())

    h = []
    labels = []
    for v in sorted(vals.keys()):
        labels.append(v)
        h.append(vals[v].values())
        if len(h[-1]) == 0:
            h[-1] = [0]
    plt.figure()
    n, bins, patches = plt.hist(h)
    plt.xlabel("Articles Published in {0}".format(org))
    plt.ylabel('Number of Authors', fontsize=20)
    plt.legend(patches, labels)
    plt.show()

print "UNKNOWN BYLINES: {0}".format(len(unknown))
print "GUESSED BYLINES: {0}".format(len(known))


washington post: 192 bylines
new york post: 116 bylines
new york times: 1445 bylines
wall street journal: 559 bylines
la times: 918 bylines
UNKNOWN BYLINES: 521
GUESSED BYLINES: 2702
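
A quick way to eyeball which names fell into the unknown bucket, for manual review (a sketch over the unknown list built above):

# Sketch: show a sample of bylines whose gender could not be guessed
for name in sorted(set(unknown))[:20]:
    print name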

In [26]:
def pct(a,b):
    return 100*(float(a)/float(b))

#note: 'huffington post' and 'salon' do not appear in media_people (no opinion bylines were extracted for them above)
for org in media_people.keys():
    print "{0}: {1} bylines".format(org,len(media_people[org]))
    article_count = {"female":0,"male":0,"unknown":0}
    people_count = {"female":0,"male":0,"unknown":0}
    
    total = 0
    people_total = 0
    for name in media_people[org].keys():
        if not op_by_freq(media_people[org][name]):
            continue
        gender = b.org_name_gender(org, name)
        if gender != "ignore":
            article_count[gender] += media_people[org][name]['count']
            people_count[gender] += 1
            people_total += 1
            total += media_people[org][name]['count']

    #ARTICLE COUNT CHART
    colors= '#78E678','#E8CA53',"#CCCCCC"

    P.figure(1, figsize=(6,6))
    labels = 'female', 'male', 'unknown'
    fracs = [pct(article_count['female'],total), pct(article_count['male'],total), pct(article_count['unknown'],total)]
    explode=(0.06, 0, 0)
    P.pie(fracs, explode=explode, colors=colors, labels=labels,
                    autopct='%1.1f%%')
    P.title('Author Gender per Article in {0} across {1} authors and {2} articles'.format(org, people_total, total), bbox={'facecolor':'0.8', 'pad':5})
    P.show()
    
    #PEOPLE COUNT CHART
    P.figure(1, figsize=(6,6))
    labels = 'female', 'male', 'unknown'
    fracs = [pct(people_count['female'],people_total), pct(people_count['male'],people_total), pct(people_count['unknown'],people_total)]
    explode=(0.06, 0, 0)
    P.pie(fracs, explode=explode, colors=colors, labels=labels,
                    autopct='%1.1f%%')
    P.title('Unique Author Gender in {0} across {1} authors and {2} articles'.format(org, people_total, total), bbox={'facecolor':'0.8', 'pad':5})
    P.show()


washington post: 192 bylines
new york post: 116 bylines
new york times: 1445 bylines
wall street journal: 559 bylines
la times: 918 bylines
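
The two pies per outlet compare article share with unique-author share; the arithmetic behind them is just a count dictionary turned into percentages with pct(). A sketch on a hypothetical count dictionary (the counts are made up for illustration):

# Sketch: turning a gender count dict into percentage shares with pct()
example_counts = {"female": 30, "male": 60, "unknown": 10}  # hypothetical counts
example_total = sum(example_counts.values())
for g in sorted(example_counts.keys()):
    print "{0}: {1:.1f}%".format(g, pct(example_counts[g], example_total))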

In [31]:
def pct(a,b):
    return 100*(float(a)/float(b))


article_pcts = {}
ppl_pcts = {}
for org in media_people.keys():
    article_pcts[org] = {"male": 0.0, "female": 0.0, "unknown": 0.0}
    ppl_pcts[org] = {"male": 0.0, "female": 0.0, "unknown": 0.0}


#note: 'huffington post' and 'salon' do not appear in media_people (no opinion bylines were extracted for them above)
for org in media_people.keys():
    print "{0}: {1} bylines".format(org,len(media_people[org]))
    article_count = {"female":0,"male":0,"unknown":0}
    people_count = {"female":0,"male":0,"unknown":0}
    
    total = 0
    people_total = 0
    for name in media_people[org].keys():
        if not op_by_freq(media_people[org][name]):
            continue
        gender = b.org_name_gender(org, name)
        if gender != "ignore":
            article_count[gender] += media_people[org][name]['count']
            people_count[gender] += 1
            people_total += 1
            total += media_people[org][name]['count']

    #STORE PERCENTAGES FOR THE STACKED BAR CHARTS BELOW

    labels = 'female', 'male', 'unknown'
    for label in labels:
        article_pcts[org][label] = pct(article_count[label],total)
        ppl_pcts[org][label] = pct(people_count[label],people_total)


washington post: 192 bylines
new york post: 116 bylines
new york times: 1445 bylines
wall street journal: 559 bylines
la times: 918 bylines
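
With article_pcts and ppl_pcts filled in, a compact per-outlet comparison of the female share of articles versus unique authors can be printed directly (a sketch; the charts below show the full breakdown):

# Sketch: female share of articles vs. unique authors per outlet
for org in sorted(article_pcts.keys()):
    print "{0}: {1:.1f}% of articles, {2:.1f}% of authors".format(
        org, article_pcts[org]['female'], ppl_pcts[org]['female'])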

In [ ]:
print article_pcts.keys()#['washington post']
#print ppl_pcts['washington post']

In [38]:
ind = [0,1]
width = 0.55

for org in media_people.keys():

    bottom_one = [article_pcts[org]['female'], ppl_pcts[org]['female']]

    bottom_two = [article_pcts[org]['male'] + article_pcts[org]['female'],
                     ppl_pcts[org]['male'] + ppl_pcts[org]['female']]
    
    female = [article_pcts[org]['female'],ppl_pcts[org]['female']]
    male = [article_pcts[org]['male'],ppl_pcts[org]['male']]
    unknown = [article_pcts[org]['unknown'],ppl_pcts[org]['unknown']]

    fig = plt.figure()   
    fig.set_size_inches(3,5)
    ax = fig.add_subplot(111)
    p1 = ax.bar(ind, female,   width, color="#78E678")
    p2 = ax.bar(ind, male,  width, bottom=bottom_one, color='#E8CA53')
    p3 = ax.bar(ind, unknown, width,bottom=bottom_two, color='#cccccc')
    plt.ylabel("Byline ratios by article")
    plt.title("{0} across {1} authors and {2} articles".format(org,len(media_people[org]), sum([media_people[org][x]['count'] for x in media_people[org].keys()])))

    #x = plt.xticks(ind)#, media_people.keys())
    ax.set_xticks(ind)
    ax.set_xticklabels(["article bylines", "contributors"], rotation=45, fontsize=10,ha='center')
    plt.legend([p1, p2,p3], ["% female","% male","% unknown"],
               bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    plt.show()



In [1]:
#STACKED BARS: article-byline gender share per outlet
#requires article_pcts from the cells above (run those first)
orgs = sorted(article_pcts.keys())
female = [article_pcts[org]['female'] for org in orgs]
male = [article_pcts[org]['male'] for org in orgs]
unknown = [article_pcts[org]['unknown'] for org in orgs]

ind = range(0, len(orgs))
width = 0.35

#the unknown segment sits on top of the female + male segments
unknownbottom = [f + m for f, m in zip(female, male)]

fig = plt.figure()
ax = fig.add_subplot(111)
p1 = ax.bar(ind, female, width, color="#78E678")
p2 = ax.bar(ind, male, width, bottom=female, color='#E8CA53')
p3 = ax.bar(ind, unknown, width, bottom=unknownbottom, color='#cccccc')
plt.ylabel("Byline ratios by article")
plt.title('Gender ratio in infrequent bylines')

ax.set_xticks(ind)
ax.set_xticklabels(orgs, rotation=45, fontsize=10, ha='right')
plt.legend([p1, p2, p3], ["female", "male", "unknown"],
           bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()



In [ ]: