notebook.community

Edit and run



In [47]:

    
#TODO: words written by men and women

import codecs
import csv
import fnmatch
import os
import sys

import cjson
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab as P

csv.field_size_limit(sys.maxsize)
lines = []

#CHOOSE A FOLDER FROM ABOVE
#folder = "./test_results/30bf9fd5bb824eb49e89c8a828276348c0b1570c/"
articles_folder = "./annual_data/"

files = ["[2013-08-01T00:00:00Z TO 2013-09-01T00:00:00Z].csv",
"[2013-09-01T00:00:00Z TO 2013-10-01T00:00:00Z].csv",
"[2013-10-01T00:00:00Z TO 2013-11-01T00:00:00Z].csv",
"[2013-11-01T00:00:00Z TO 2013-12-01T00:00:00Z].csv",
"[2013-12-01T00:00:00Z TO 2014-01-01T00:00:00Z].csv",
"[2014-01-01T00:00:00Z TO 2014-02-01T00:00:00Z].csv",
"[2014-02-01T00:00:00Z TO 2014-03-01T00:00:00Z].csv",
"[2014-03-01T00:00:00Z TO 2014-04-01T00:00:00Z].csv",
"[2014-04-01T00:00:00Z TO 2014-05-01T00:00:00Z].csv",
"[2014-05-01T00:00:00Z TO 2014-06-01T00:00:00Z].csv",
"[2014-06-01T00:00:00Z TO 2014-07-01T00:00:00Z].csv",
"[2014-07-01T00:00:00Z TO 2014-08-01T00:00:00Z].csv"]

topic_folders = [
"[2013-08-01T00:00:00Z TO 2013-09-01T00:00:00Z]",
"[2013-09-01T00:00:00Z TO 2013-10-01T00:00:00Z]",
"[2013-10-01T00:00:00Z TO 2013-11-01T00:00:00Z]",
"[2013-11-01T00:00:00Z TO 2013-12-01T00:00:00Z]",
"[2013-12-01T00%3A00%3A00Z TO 2014-01-01T00%3A00%3A00Z]",
"[2014-01-01T00:00:00Z TO 2014-02-01T00:00:00Z]",
"[2014-02-01T00%3A00%3A00Z TO 2014-03-01T00%3A00%3A00Z]",
"[2014-03-01T00:00:00Z TO 2014-04-01T00:00:00Z]",
"[2014-04-01T00:00:00Z TO 2014-05-01T00:00:00Z]",
"[2014-05-01T00:00:00Z TO 2014-06-01T00:00:00Z]",
"[2014-06-01T00:00:00Z TO 2014-07-01T00:00:00Z]",
"2014-07-01T00%3A00%3A00Z TO 2014-08-01T00%3A00%3A00Z"]

#top = 0
for index in range(0, len(files)):
    # open up the articles CSV file for a given month
    with open (os.path.join(articles_folder,files[index])) as f:
        reader = csv.DictReader(f)
        
        #find the corresponding month's opinion topic data
        for file in os.listdir(os.path.join("./classification_results",topic_folders[index])):
            if fnmatch.fnmatch(file, 'opeds*JSON'):
                fd = codecs.open(os.path.join("./classification_results", topic_folders[index], file), 'r', encoding='utf8').read()
                topic_data = cjson.decode(fd)
        for i, row in enumerate(reader):
            #print i
            #print row['extractor.full_text'][0:20]
            #print topic_data['topics'][i]['txt'][0:20]
            #print "---"
            row['topic']= topic_data['topics'][i]['classified_as'] #ignore header row in article CSV TODO: improve later
            lines.append(row)
    #pop the key row and set up the next top pop
    #lines.pop(top)
    #rows = len(lines) - top + 1
    #top = len(lines)
    
    #check topic_data and lines to see if merge is feasible
    print( "articles: {0}, topic_data: {1}".format(rows,len(topic_data['topics'])))

lines[0].keys()
len(lines)









    



articles: 38977, topic_data: 33391
articles: 38977, topic_data: 29711
articles: 38977, topic_data: 32772
articles: 38977, topic_data: 34858
articles: 38977, topic_data: 34066
articles: 38977, topic_data: 39009
articles: 38977, topic_data: 33294
articles: 38977, topic_data: 37361
articles: 38977, topic_data: 38809
articles: 38977, topic_data: 41253
articles: 38977, topic_data: 36646
articles: 38977, topic_data: 38977






    Out[47]:





430147



In [48]:

    
# Previous code

#data = codecs.open('./Data/topic_oped_classification_results-sands_copy/2014-01-01_TO_2014-02-01/topics_file_2014-08-18_11_02_38.JSON', 'r', encoding='utf8').read()
#data = cjson.decode(data)
#topics = data['topics']
#print('Found %d topics in file.' % (len(topics)))



In [49]:

    
#import fnmatch
#for index in range(0,len(files)):
#    for file in os.listdir(os.path.join("classification_results",topic_folders[index])):
#        if fnmatch.fnmatch(file, 'opeds*JSON'):
#            f = codecs.open(os.path.join("classification_results", topic_folders[index], file), 'r', encoding='utf8').read()
#            data = cjson.decode(f)
#            topics = data['topics']
#            print('Found %d topics in file.' % (len(topics)))



In [49]:



In [50]:

    
#SUMMARIZE SECTION IDENTIFICATION
lines[0].keys()

MEDIA= {
  '1': "new york times",
  '2': "washington post",
  '6':"la times",
  '7': "new york post",
  '1150': "wall street journal",
  '1757': "salon",
  '1707': "daily beast",
  '1750': "telegraph",
  '314' : "huffington post",
"27502":"huffington post" #assuming these are the same for now
}

media = {}

for line in lines:
    mediakey = MEDIA[line['media_id']]
    section = line['section']
    if(not mediakey in media):
        media[mediakey] = {}
    if(not section in media[mediakey]):
        media[mediakey][section] = 0
    media[mediakey][section] += 1
        
        
for key in media.keys():
    articles = 0
    for section in media[key].keys():
        articles += media[key][section]
    print "{0}: {1} sections, {2} articles".format(key,len(media[key]),articles)
    
    #for section in media[key].keys():
    #    if(not section is None):
    #        if(section.lower().find("opinion")>=0):
    #            print "    {0}: {1}".format(section,media[key][section])









    



salon: 1 sections, 4856 articles
huffington post: 2 sections, 152766 articles
washington post: 918 sections, 12135 articles
la times: 67 sections, 51770 articles
new york times: 129 sections, 89825 articles
new york post: 18 sections, 46238 articles
wall street journal: 41 sections, 72557 articles



In [51]:

    
# GROUP BYLINES BY MEDIA ORGANISATION
# AND SUMMARIZE
media_people = {}

from byline_gender import BylineGender
b = BylineGender()
        
#for key in media.keys():
#    articles = 0
#    for section in media[key].keys():
#        articles += media[key][section]
#    print "{0}: {1} sections, {2} articles".format(key,len(media[key]),articles)
#    for section in media[key].keys():
#        if(section.lower().find("opinion")>=0):
#            print "    {0}: {1}".format(section,media[key][section])

sections = []
for line in lines:
    mediakey = MEDIA[line['media_id']]
    byline_text = line['byline']
    # if it's an opinion article, add it to the dataset
    if(not line['topic'] is None and line['topic'] == "oped"):
        section = line['section']
        if not section in sections:
            sections.append(section)
        for byline in b.get_full_names(byline_text):
            if(not mediakey in media_people):
                media_people[mediakey] = {}
            if(not byline in media_people[mediakey]):
                media_people[mediakey][byline] = 0
            media_people[mediakey][byline] += 1
            
#print "---"
#print sections
#for key in media_people.keys():
#    print "{0}: {1} bylines".format(key,len(media_people[key]))



In [52]:

    
for key in sort(media_people.keys()):
    print "{0}: {1} bylines".format(key,len(media_people[key]))
    values = media_people[key].values()
    plt.hist(values, max(values))
    plt.xlabel("Articles Published in {0}".format(key))
    plt.ylabel('Number of Authors', fontsize= 20)
    plt.show()









    



huffington post: 12579 bylines






    












    



la times: 1101 bylines






    












    



new york post: 246 bylines






    












    



new york times: 2912 bylines






    












    



salon: 702 bylines






    












    



wall street journal: 1915 bylines






    












    



washington post: 562 bylines



In [53]:

    
from byline_gender import BylineGender
b = BylineGender()
b.load_name_org_online()
print b.org_name_gender("washington post","editorial board")
unknown = []
known = []

for org in sort(media_people.keys()):
    print "{0}: {1} bylines".format(org,len(media_people[org]))
    vals = {"female":{},"male":{},"unknown":{}}
    for name in media_people[org].keys():
        #gender = b.single_name_gender(name)
        gender = b.org_name_gender(org,name)
        if(not gender == "ignore"):
            vals[gender][name]=media_people[org][name]
            if gender == "unknown":
                unknown.append(name)
            else:
                known.append(name)    
    m = 0
    for v in vals.values():
        if(len(v) > 0 and max(v)>m):
            m = max(v)
    
    h = []
    labels = []
    for v in sort(vals.keys()):
        labels.append(v)
        h.append(vals[v].values())
        if(len(h[-1]) == 0):
            h[-1]=[0]
    plt.figure()    
    n,bins,patches = plt.hist(h)
    plt.xlabel("Articles Published in {0}".format(org))
    plt.ylabel('Number of Authors', fontsize= 20)
    legend(patches, labels)
    plt.show()

print "UNKNOWN BYLINES: {0}".format(len(unknown))
print "GUESSED BYLINES: {0}".format(len(known))









    



ignore
huffington post: 12579 bylines






    












    



la times: 1101 bylines






    












    



new york post: 246 bylines






    












    



new york times: 2912 bylines






    












    



salon: 702 bylines






    












    



wall street journal: 1915 bylines






    












    



washington post: 562 bylines






    












    



UNKNOWN BYLINES: 4831
GUESSED BYLINES: 15041



In [54]:

    
#OUTPUT TO BYLINE FILE

#f = open('org_people_upload.csv', 'w')
#b.export_org_names(media_people,f)
#f.close()



In [56]:

    
def pct(a,b):
    return 100*(float(a)/float(b))

pct_table = []
pct_table.append(["org","type","female","male","unknown"])

for org in sort(media_people.keys()):
    print "{0}: {1} bylines".format(org,len(media_people[org]))
    article_count = {"female":0,"male":0,"unknown":0}
    people_count = {"female":0,"male":0,"unknown":0}
    
    total = 0
    nontotal = 0
    people_total = 0
    for name in media_people[org].keys():
        #gender = b.single_name_gender(name)
        gender = b.org_name_gender(org,name)
        if(not gender == "ignore"):
            article_count[gender]+= media_people[org][name]
            people_count[gender] += 1
            people_total += 1
            total += media_people[org][name] 
        else:
            nontotal += media_people[org][name]

    #ARTICLE COUNT CHART
    colors= '#78E678','#E8CA53',"#CCCCCC"
    #print "NONTOTAL: {0}".format(nontotal)
    pct_table.append(["people",org,people_count['female'],people_count['male'],people_count['unknown']])
    pct_table.append(["article",org,article_count['female'],article_count['male'],article_count['unknown']])
    #print "{0},{1},{2},{3},{4}".format("people",org,people_count['female'],people_count['male'],people_count['unknown'])
    #print "{0},{1},{2},{3},{4}".format("article",org,article_count['female'],article_count['male'],article_count['unknown'])
    
    P.figure(1, figsize=(6,6))
    labels = 'female', 'male', 'unknown'
    fracs = [pct(article_count['female'],total), pct(article_count['male'],total), pct(article_count['unknown'],total)]
    explode=(0.06, 0, 0)
    P.pie(fracs, explode=explode, colors=colors, labels=labels,
                    autopct='%1.1f%%')
    P.title('Author Gender per Article in {0} across {1} authors and {2} articles'.format(org,len(media_people[org]), total), bbox={'facecolor':'0.8', 'pad':5})
    P.show()
    
    #PEOPLE COUNT CHART
    P.figure(1, figsize=(6,6))
    labels = 'female', 'male', 'unknown'
    fracs = [pct(people_count['female'],people_total), pct(people_count['male'],people_total), pct(people_count['unknown'],people_total)]
    explode=(0.06, 0, 0)
    P.pie(fracs, explode=explode, colors=colors, labels=labels,
                    autopct='%1.1f%%')
    P.title('Unique Author Gender in {0} across {1} authors and {2} articles'.format(org,len(media_people[org]), total), bbox={'facecolor':'0.8', 'pad':5})
    P.show()
    
for l in pct_table:
    print l









    



huffington post: 12579 bylines






    












    












    



la times: 1101 bylines






    












    












    



new york post: 246 bylines






    












    












    



new york times: 2912 bylines






    












    












    



salon: 702 bylines






    












    












    



wall street journal: 1915 bylines






    












    












    



washington post: 562 bylines






    












    












    



['org', 'type', 'female', 'male', 'unknown']
['people', 'huffington post', 4135, 4880, 3441]
['article', 'huffington post', 18221, 22683, 11621]
['people', 'la times', 289, 609, 194]
['article', 'la times', 2908, 5176, 1721]
['people', 'new york post', 62, 141, 42]
['article', 'new york post', 223, 555, 380]
['people', 'new york times', 841, 1557, 512]
['article', 'new york times', 4263, 13603, 4903]
['people', 'salon', 239, 302, 157]
['article', 'salon', 1375, 1510, 702]
['people', 'wall street journal', 448, 1081, 381]
['article', 'wall street journal', 3142, 7108, 2845]
['people', 'washington post', 163, 294, 104]
['article', 'washington post', 1841, 4432, 565]



In [ ]:

    
b.org_name_gender("washington post","editorial board")



In [57]:

    
accuracy = {}
for line in lines:
    
    mediakey = MEDIA[line['media_id']]
    if mediakey not in accuracy.keys():
        accuracy[mediakey]={"correct_included":0,"correct_excluded":0,
                            "incorrect_included":0,"incorrect_excluded":0}    
    section = line['section']
    is_oped = (not line['topic'] is None and line['topic'] == "oped")
    if(not section is None and section.lower().find("opinion")>=0):
        if(is_oped):
            accuracy[mediakey]['correct_included']+=1
        else:
            accuracy[mediakey]['incorrect_excluded']+=1
    else:
        if(is_oped):
            accuracy[mediakey]['incorrect_included']+=1
        else:
            accuracy[mediakey]['correct_excluded']+=1
    if ((accuracy[mediakey]['correct_included'] + accuracy[mediakey]['incorrect_included'] >0) and (accuracy[mediakey]['correct_included'] + accuracy[mediakey]['incorrect_excluded'] )):
        accuracy[mediakey]['precision'] = float(accuracy[mediakey]['correct_included']) / (float(accuracy[mediakey]['correct_included']) + float(accuracy[mediakey]['incorrect_included']))
        accuracy[mediakey]['recall'] = float(accuracy[mediakey]['correct_included']) / (float(accuracy[mediakey]['correct_included']) + float(accuracy[mediakey]['incorrect_excluded']))
    else:
        accuracy[mediakey]['precision'] = 0.
        accuracy[mediakey]['recall'] = 0.
print "publication,precision, recall, correct_included,correct_excluded,incorrect_included,incorrect_excluded"
for mediakey in accuracy.keys():
    print("{0},{1},{2},{3},{4},{5},{6}".format(mediakey,accuracy[mediakey]['precision'],accuracy[mediakey]['recall'],accuracy[mediakey]['correct_included'],accuracy[mediakey]['correct_excluded'],accuracy[mediakey]['incorrect_included'],accuracy[mediakey]['incorrect_excluded']))









    



publication,precision, recall, correct_included,correct_excluded,incorrect_included,incorrect_excluded
salon,0.0,0.0,0,1714,3142,0
huffington post,0.0,0.0,0,95933,56833,0
washington post,0.397125782584,0.73196957776,2791,4085,4237,1022
la times,0.200299593409,0.561993395377,1872,40965,7474,1459
new york times,0.217440686522,0.807151780137,5169,64818,18603,1235
new york post,0.302841918295,0.826466311197,1705,40250,3925,358
wall street journal,0.101736111111,0.75089697591,1465,57671,12935,486



In [ ]: