In [17]:
#TODO: words written by men and women

import csv
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pylab as P

csv.field_size_limit(sys.maxsize)
lines = []

#CHOOSE A FOLDER FROM ABOVE
#folder = "./test_results/30bf9fd5bb824eb49e89c8a828276348c0b1570c/"
folder = "./annual_data/"

files = ["[2013-08-01T00:00:00Z TO 2013-09-01T00:00:00Z].csv",
"[2013-09-01T00:00:00Z TO 2013-10-01T00:00:00Z].csv",
"[2013-10-01T00:00:00Z TO 2013-11-01T00:00:00Z].csv",
"[2013-11-01T00:00:00Z TO 2013-12-01T00:00:00Z].csv",
"[2013-12-01T00:00:00Z TO 2014-01-01T00:00:00Z].csv",
"[2014-01-01T00:00:00Z TO 2014-02-01T00:00:00Z].csv",
"[2014-02-01T00:00:00Z TO 2014-03-01T00:00:00Z].csv",
"[2014-03-01T00:00:00Z TO 2014-04-01T00:00:00Z].csv",
"[2014-04-01T00:00:00Z TO 2014-05-01T00:00:00Z].csv",
"[2014-05-01T00:00:00Z TO 2014-06-01T00:00:00Z].csv",
"[2014-06-01T00:00:00Z TO 2014-07-01T00:00:00Z].csv",
"[2014-07-01T00:00:00Z TO 2014-08-01T00:00:00Z].csv"]

top = 0
for filename in files:
    #with open (os.path.join(folder,"month_06_2014.csv")) as f:
    with open (os.path.join(folder,filename)) as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            lines.append(row)
    lines.pop(top)
    top = len(lines)

lines[0].keys()
len(lines)


Out[17]:
430135

In [18]:
import requests
import json

# reduces social media metrics to a single number
# highly reductionist, as you might expect
class SocialMedia:
    """Reduce a URL's social-media engagement to one number per network.

    Each public method queries a single network's public counter endpoint
    and returns a number, or ``None`` when the response carries no usable
    count (missing field, empty result, or a body that is not a JSON object).
    Network failures and timeouts raised by ``requests`` are not caught.

    NOTE(review): these are 2013-14 era public endpoints; verify they are
    still served before relying on the numbers.
    """

    # Seconds to wait for a network before giving up. ``requests`` applies no
    # timeout unless one is passed, so a stalled server would otherwise block
    # the whole analysis.
    TIMEOUT = 10

    def _get_json(self, endpoint, params):
        """GET ``endpoint`` with ``params`` and return the decoded JSON object.

        ``params`` is handed to ``requests`` so every value is percent-encoded;
        an article URL that itself contains ``&``, ``#`` or ``?`` therefore
        reaches the API intact instead of being split into extra parameters.

        Returns ``None`` when the body is not valid JSON (for example an HTML
        error page) or decodes to something other than a JSON object.
        """
        res = requests.get(endpoint, params=params, timeout=self.TIMEOUT)
        try:
            payload = json.loads(res.text)
        except ValueError:
            return None
        if not isinstance(payload, dict):
            return None
        return payload

    def facebook(self, url):
        """Return the ``total_count`` Facebook reports for ``url``, or ``None``.

        The ``http://`` scheme is stripped from ``url`` before it is placed in
        the FQL query.
        """
        fql = ('SELECT like_count, total_count, share_count, click_count, '
               'comment_count FROM link_stat WHERE url = "{0}"')
        payload = self._get_json(
            "https://graph.facebook.com/fql",
            {"q": fql.format(url.replace("http://", ""))})
        if payload is None:
            return None
        rows = payload.get('data')
        if isinstance(rows, list) and len(rows) > 0:
            return rows[0].get('total_count')
        return None

    def twitter(self, url):
        """Return the ``count`` Twitter reports for ``url``, or ``None``."""
        payload = self._get_json(
            "http://urls.api.twitter.com/1/urls/count.json", {"url": url})
        if payload is None:
            return None
        return payload.get('count')

    def reddit(self, url):
        """Return ups + comments of the first Reddit listing for ``url``.

        Prints a diagnostic and returns ``None`` when the response has no
        ``data`` member; returns ``None`` silently when there are no listings.
        """
        payload = self._get_json(
            "http://buttons.reddit.com/button_info.json", {"url": url})
        if payload is None or "data" not in payload:
            # Same diagnostic text as before: the endpoint with the raw URL.
            reddit_url = "http://buttons.reddit.com/button_info.json?url={0}".format(url)
            print("REDDIT ERROR WITH {0}".format(reddit_url))
            return None
        data = payload['data']
        if "children" in data and len(data["children"]) > 0 and "data" in data["children"][0]:
            child = data["children"][0]
            return child['data']['ups'] + child['data']['num_comments']
        return None

In [19]:
#SUMMARIZE SECTION IDENTIFICATION
# Peek at the column names of the first row (the value is discarded).
lines[0].keys()

# media_id (as it appears in the CSV) -> human-readable outlet name.
MEDIA = {
    '1': "new york times",
    '2': "washington post",
    '6': "la times",
    '7': "new york post",
    '1150': "wall street journal",
    '1757': "salon",
    '1707': "daily beast",
    '1750': "telegraph",
    '314': "huffington post",
    "27502": "huffington post",  #assuming these are the same for now
}

# outlet name -> {section name -> number of articles}
media = {}

for row in lines:
    outlet = MEDIA[row['media_id']]
    section_name = row['section']
    per_section = media.setdefault(outlet, {})
    per_section[section_name] = per_section.get(section_name, 0) + 1

# One summary line per outlet: distinct sections and total articles.
for key, section_counts in media.items():
    articles = sum(section_counts.values())
    print("{0}: {1} sections, {2} articles".format(key, len(section_counts), articles))
    
    #for section in media[key].keys():
    #    if(not section is None):
    #        if(section.lower().find("opinion")>=0):
    #            print "    {0}: {1}".format(section,media[key][section])


salon: 1 sections, 4856 articles
huffington post: 2 sections, 152765 articles
washington post: 918 sections, 12135 articles
la times: 67 sections, 51766 articles
new york post: 18 sections, 46237 articles
new york times: 129 sections, 89820 articles
wall street journal: 41 sections, 72556 articles

In [33]:
# GROUP BYLINES BY MEDIA ORGANISATION
# AND SUMMARIZE
# outlet name -> {author full name -> number of opinion articles}
media_people = {}

from byline_gender import BylineGender
b = BylineGender()

# Distinct section labels that were treated as opinion sections.
sections = []
for line in lines:
    mediakey = MEDIA[line['media_id']]
    byline_text = line['byline']
    section_name = line['section']

    # Only rows whose section label mentions "opinion" are counted.
    if section_name is None or "opinion" not in section_name.lower():
        continue

    if section_name not in sections:
        sections.append(section_name)

    # A byline may name several authors; each is credited with the article.
    for byline in b.get_full_names(byline_text):
        author_counts = media_people.setdefault(mediakey, {})
        author_counts[byline] = author_counts.get(byline, 0) + 1
            
#print "---"
#print sections
#for key in media_people.keys():
#    print "{0}: {1} bylines".format(key,len(media_people[key]))

In [21]:
# One histogram per outlet: how many authors published N opinion articles.
# Uses the builtin ``sorted`` so the cell does not depend on a ``sort`` name
# that only exists when the pylab namespace has been star-imported.
for key in sorted(media_people.keys()):
    print("{0}: {1} bylines".format(key,len(media_people[key])))
    values = list(media_people[key].values())
    # One bin per possible article count, up to the most prolific author.
    plt.hist(values, max(values))
    plt.xlabel("Articles Published in {0}".format(key))
    plt.ylabel('Number of Authors', fontsize= 20)
    plt.show()


la times: 918 bylines
new york post: 116 bylines
new york times: 1445 bylines
wall street journal: 559 bylines
washington post: 192 bylines

In [22]:
from byline_gender import BylineGender
b = BylineGender()
b.load_name_org_online()
print(b.org_name_gender("washington post","editorial board"))

# Author names across all outlets, split by whether a gender was assigned.
unknown = []
known = []

for org in sorted(media_people.keys()):
    print("{0}: {1} bylines".format(org,len(media_people[org])))
    # gender -> {author name -> article count} for this outlet
    vals = {"female":{},"male":{},"unknown":{}}
    for name in media_people[org].keys():
        #gender = b.single_name_gender(name)
        gender = b.org_name_gender(org,name)
        # Bylines classified as "ignore" are left out of the charts.
        if gender == "ignore":
            continue
        vals[gender][name] = media_people[org][name]
        if gender == "unknown":
            unknown.append(name)
        else:
            known.append(name)

    # One article-count series per gender, in alphabetical label order.
    h = []
    labels = []
    for v in sorted(vals.keys()):
        labels.append(v)
        # An empty series is replaced by [0] so hist still receives data.
        h.append(list(vals[v].values()) or [0])

    plt.figure()
    n,bins,patches = plt.hist(h)
    # Label with the outlet being plotted; the previous code used a stale
    # `key` left over from another cell's loop, so every chart carried the
    # same outlet name.
    plt.xlabel("Articles Published in {0}".format(org))
    plt.ylabel('Number of Authors', fontsize= 20)
    plt.legend(patches, labels)
    plt.show()

print("UNKNOWN BYLINES: {0}".format(len(unknown)))
print("GUESSED BYLINES: {0}".format(len(known)))


ignore
la times: 918 bylines
new york post: 116 bylines
new york times: 1445 bylines
wall street journal: 559 bylines
washington post: 192 bylines
UNKNOWN BYLINES: 522
GUESSED BYLINES: 2701

In [23]:
#OUTPUT TO BYLINE FILE

#f = open('org_people_upload.csv', 'w')
#b.export_org_names(media_people,f)
#f.close()

In [32]:
def pct(a,b):
    """Return ``a`` as a percentage of ``b``.

    Both arguments are converted with ``float``. A zero denominator yields
    ``0.0`` instead of raising ZeroDivisionError, so a group with nothing to
    count reports 0%.
    """
    denominator = float(b)
    if denominator == 0:
        return 0.0
    return 100*(float(a)/denominator)

def _plot_gender_pie(counts, denominator, title):
    """Draw one female/male/unknown pie chart from raw ``counts``.

    ``counts`` maps each of the three labels to a number; each slice is that
    number as a percentage of ``denominator``.
    """
    labels = 'female', 'male', 'unknown'
    colors = '#78E678','#E8CA53',"#CCCCCC"
    fracs = [pct(counts[label], denominator) for label in labels]
    P.figure(1, figsize=(6,6))
    P.pie(fracs, explode=(0.06, 0, 0), colors=colors, labels=labels,
                    autopct='%1.1f%%')
    P.title(title, bbox={'facecolor':'0.8', 'pad':5})
    P.show()


pct_table = []
# Header order matches the rows appended below: row type first, then outlet.
pct_table.append(["type","org","female","male","unknown"])

for org in sorted(media_people.keys()):
    print("{0}: {1} bylines".format(org,len(media_people[org])))
    article_count = {"female":0,"male":0,"unknown":0}
    people_count = {"female":0,"male":0,"unknown":0}

    total = 0          # articles by counted (non-ignored) bylines
    nontotal = 0       # articles by bylines classified as "ignore"
    people_total = 0   # counted (non-ignored) bylines
    for name in media_people[org].keys():
        #gender = b.single_name_gender(name)
        gender = b.org_name_gender(org,name)
        if gender == "ignore":
            nontotal += media_people[org][name]
            continue
        article_count[gender] += media_people[org][name]
        people_count[gender] += 1
        people_total += 1
        total += media_people[org][name]

    pct_table.append(["people",org,people_count['female'],people_count['male'],people_count['unknown']])
    pct_table.append(["article",org,article_count['female'],article_count['male'],article_count['unknown']])

    # Nothing countable for this outlet: there is no meaningful pie to draw.
    if total == 0:
        continue

    # The titles report the counted authors (ignored bylines excluded) so the
    # numbers agree with the slices shown.
    #ARTICLE COUNT CHART
    _plot_gender_pie(
        article_count, total,
        'Author Gender per Article in {0} across {1} authors and {2} articles'.format(org, people_total, total))

    #PEOPLE COUNT CHART
    _plot_gender_pie(
        people_count, people_total,
        'Unique Author Gender in {0} across {1} authors and {2} articles'.format(org, people_total, total))

for table_row in pct_table:
    print(table_row)


la times: 918 bylines
new york post: 116 bylines
new york times: 1445 bylines
wall street journal: 559 bylines
washington post: 192 bylines
['org', 'type', 'female', 'male', 'unknown']
['people', 'la times', 226, 564, 124]
['article', 'la times', 808, 1931, 281]
['people', 'new york post', 17, 83, 15]
['article', 'new york post', 77, 224, 30]
['people', 'new york times', 329, 841, 274]
['article', 'new york times', 876, 2158, 468]
['people', 'wall street journal', 73, 408, 78]
['article', 'wall street journal', 200, 994, 136]
['people', 'washington post', 46, 114, 31]
['article', 'washington post', 587, 2022, 61]

In [25]:
b.org_name_gender("washington post","editorial board")


Out[25]:
'ignore'

In [25]: