In [ ]:
#TODO: words written by men and women
import csv
import numpy as np
import matplotlib.pyplot as plt
import pylab as P
import os
# Load every monthly CSV export into one flat list of row dicts (`lines`).
#
# CHOOSE A FOLDER FROM ABOVE
#folder = "./test_results/30bf9fd5bb824eb49e89c8a828276348c0b1570c/"
folder = "./annual_data/"

# One export per calendar month, August 2013 through July 2014.
files = ["[2013-08-01T00:00:00Z TO 2013-09-01T00:00:00Z].csv",
         "[2013-09-01T00:00:00Z TO 2013-10-01T00:00:00Z].csv",
         "[2013-10-01T00:00:00Z TO 2013-11-01T00:00:00Z].csv",
         "[2013-11-01T00:00:00Z TO 2013-12-01T00:00:00Z].csv",
         "[2013-12-01T00:00:00Z TO 2014-01-01T00:00:00Z].csv",
         "[2014-01-01T00:00:00Z TO 2014-02-01T00:00:00Z].csv",
         "[2014-02-01T00:00:00Z TO 2014-03-01T00:00:00Z].csv",
         "[2014-03-01T00:00:00Z TO 2014-04-01T00:00:00Z].csv",
         "[2014-04-01T00:00:00Z TO 2014-05-01T00:00:00Z].csv",
         "[2014-05-01T00:00:00Z TO 2014-06-01T00:00:00Z].csv",
         "[2014-06-01T00:00:00Z TO 2014-07-01T00:00:00Z].csv",
         "[2014-07-01T00:00:00Z TO 2014-08-01T00:00:00Z].csv"]

lines = []
for filename in files:
    # Open the file for THIS iteration. The previous code opened files[0]
    # every time, so the first month was loaded twelve times over.
    with open(os.path.join(folder, filename)) as f:
        month_rows = list(csv.DictReader(f))
    # The original dropped the first data row of each file (lines.pop(top)).
    # NOTE(review): presumably each export repeats its header as the first
    # data row -- confirm against the CSVs. Slicing keeps that behaviour and
    # also tolerates a file that has no rows at all.
    lines.extend(month_rows[1:])

# Show the available column names.
lines[0].keys()
In [6]:
import requests
import json
# reduces social media metrics to a single number
# highly reductionist, as you might expect
class SocialMedia:
    """Reduce a URL's social-media activity to a single number per network.

    Each public method takes the URL of an article and returns one number
    taken from that network's JSON response, or None when the service
    returned no usable data (including network failures and non-JSON
    replies).
    """

    # Seconds to wait for one HTTP call before giving up, so a stalled
    # service cannot hang a long batch run.
    TIMEOUT = 10

    def _get_json(self, endpoint, params=None):
        """GET `endpoint` and return the decoded JSON body, or None on failure.

        `params` is handed to requests so that values (notably the article
        URL) are percent-encoded; concatenating them by hand corrupts the
        request when the article URL itself contains '&', '#' or spaces.
        """
        try:
            res = requests.get(endpoint, params=params, timeout=self.TIMEOUT)
            return json.loads(res.text)
        except (requests.exceptions.RequestException, ValueError):
            # Connection problem, timeout, or a body that is not JSON.
            return None

    def facebook(self, url):
        """Return `total_count` from Facebook's link_stat table, or None."""
        # The link_stat lookup is done without the "http://" scheme.
        fql = ('SELECT like_count, total_count, share_count, click_count, '
               'comment_count FROM link_stat WHERE url = "{0}"'
               ).format(url.replace("http://", ""))
        j = self._get_json("https://graph.facebook.com/fql", params={"q": fql})
        if isinstance(j, dict) and len(j.get('data') or []) > 0:
            return j['data'][0]['total_count']
        return None

    def twitter(self, url):
        """Return the `count` field of Twitter's URL-count response, or None."""
        j = self._get_json("http://urls.api.twitter.com/1/urls/count.json",
                           params={"url": url})
        if isinstance(j, dict) and 'count' in j:
            return j['count']
        return None

    def reddit(self, url):
        """Return ups + num_comments of the first Reddit listing child, or None."""
        endpoint = "http://buttons.reddit.com/button_info.json"
        j = self._get_json(endpoint, params={"url": url})
        if not isinstance(j, dict) or "data" not in j:
            print("REDDIT ERROR WITH {0}?url={1}".format(endpoint, url))
            return None
        data = j['data']
        if "children" in data and len(data["children"]) > 0 and "data" in data["children"][0]:
            # Only the first child is considered.
            child = data["children"][0]
            return child['data']['ups'] + child['data']['num_comments']
        return None
# Smoke test: query each network once for a known URL and print the result.
sm = SocialMedia()
demo_url = "http://civic.mit.edu"
facebook_score = sm.facebook(demo_url)
print("Facebook: {0}".format(facebook_score))
twitter_score = sm.twitter(demo_url)
print("Twitter: {0}".format(twitter_score))
reddit_score = sm.reddit(demo_url)
print("Reddit: {0}".format(reddit_score))
In [7]:
#SUMMARIZE SECTION IDENTIFICATION
# Maps the `media_id` column of the CSV exports (a string) to a
# human-readable outlet name.
MEDIA = {
    '1':     "new york times",
    '2':     "washington post",
    '6':     "la times",
    '7':     "new york post",
    '1150':  "wall street journal",
    '1757':  "salon",
    '1707':  "daily beast",
    '1750':  "telegraph",
    '314':   "huffington post",
    # assuming these are the same for now
    '27502': "huffington post",
}
# Tally articles per (outlet, section): media[outlet][section] -> count.
media = {}
for line in lines:
    mediakey = MEDIA[line['media_id']]
    section = line['section']
    section_counts = media.setdefault(mediakey, {})
    section_counts[section] = section_counts.get(section, 0) + 1

# Per outlet: print the totals, then each section whose name mentions "opinion".
for key in media.keys():
    articles = sum(media[key].values())
    print("{0}: {1} sections, {2} articles".format(key, len(media[key]), articles))
    for section in media[key].keys():
        if "opinion" in section.lower():
            print(" {0}: {1}".format(section, media[key][section]))
In [1]:
# GROUP BYLINES BY MEDIA ORGANISATION
# AND SUMMARIZE
import sys
import time
from byline_gender import BylineGender

# Only this outlet's opinion articles are processed: every article costs two
# remote calls, so the social media analysis is limited to one organisation.
SOCIAL_MEDIA_ORG = "la times"
# Pause between articles so the remote services are not hammered.
REQUEST_PAUSE_SECONDS = 0.25

sm = SocialMedia()
b = BylineGender()
media_people = {}  # outlet -> byline -> number of opinion articles
mpop = {}          # outlet -> byline -> summed Facebook + Twitter counts
sections = []      # distinct opinion section names seen, in first-seen order

for line in lines:
    section = line['section']
    if "opinion" not in section.lower():
        continue
    mediakey = MEDIA[line['media_id']]
    # Compare by value. The original used `is`, which tests object identity
    # and only matches a string literal by accident of interning.
    if mediakey != SOCIAL_MEDIA_ORG:
        continue
    byline_text = line['byline']

    # Networks that returned no data (None) are left out of the sum.
    social_media = [sm.facebook(line['url']), sm.twitter(line['url'])]
    smcount = sum(y for y in social_media if y is not None)
    # Progress marker. smcount is a number, so it must be formatted before
    # being written (the original added an int to a str).
    sys.stdout.write("{0}.".format(smcount))
    time.sleep(REQUEST_PAUSE_SECONDS)

    if section not in sections:
        sections.append(section)

    # NOTE(review): get_full_names presumably yields one name per author in
    # the byline -- confirm against byline_gender.
    for byline in b.get_full_names(byline_text):
        org_articles = media_people.setdefault(mediakey, {})
        org_social = mpop.setdefault(mediakey, {})
        org_articles[byline] = org_articles.get(byline, 0) + 1
        org_social[byline] = org_social.get(byline, 0) + smcount

print("---")
print(sections)
for key in media_people.keys():
    print("{0}: {1} bylines".format(key, len(media_people[key])))
In [ ]:
# One histogram per outlet: how many authors published how many articles.
# Uses the builtin sorted(); the bare `sort` only existed when the kernel had
# been started with pylab's namespace import.
for key in sorted(media_people.keys()):
    print("{0}: {1} bylines".format(key, len(media_people[key])))
    values = list(media_people[key].values())
    # One bin per possible article count, up to the most prolific author.
    plt.hist(values, max(values))
    plt.xlabel("Articles Published in {0}".format(key))
    plt.ylabel('Number of Authors', fontsize= 20)
    plt.show()
In [ ]:
from byline_gender import BylineGender

# Per outlet: split authors by inferred gender and plot, for each gender, the
# distribution of articles-per-author. Also collect which names could and
# could not be classified.
b = BylineGender()
b.load_name_org_online()
unknown = []  # names classified as "unknown"
known = []    # names given any other (non-"ignore") classification

for org in sorted(media_people.keys()):
    print("{0}: {1} bylines".format(org, len(media_people[org])))
    # gender -> name -> article count
    # NOTE(review): assumes org_name_gender only returns "female", "male",
    # "unknown" or "ignore"; any other value raises KeyError below.
    vals = {"female": {}, "male": {}, "unknown": {}}
    for name in media_people[org].keys():
        #gender = b.single_name_gender(name)
        gender = b.org_name_gender(org, name)
        if gender == "ignore":
            continue
        vals[gender][name] = media_people[org][name]
        # Compare by value; `is` on a string literal tests identity.
        if gender == "unknown":
            unknown.append(name)
        else:
            known.append(name)

    h = []
    labels = []
    for v in sorted(vals.keys()):
        labels.append(v)
        counts = list(vals[v].values())
        # An empty category gets a single 0 so it still has a data series.
        h.append(counts if len(counts) > 0 else [0])

    plt.figure()
    n, bins, patches = plt.hist(h)
    # Label with the outlet being plotted; the original used `key`, a stale
    # variable left over from an earlier cell's loop.
    plt.xlabel("Articles Published in {0}".format(org))
    plt.ylabel('Number of Authors', fontsize= 20)
    plt.legend(patches, labels)
    plt.show()

print("UNKNOWN BYLINES: {0}".format(len(unknown)))
print("GUESSED BYLINES: {0}".format(len(known)))
In [ ]:
#OUTPUT TO BYLINE FILE
#f = open('org_people_upload.csv', 'w')
#for org in sort(media_people.keys()):
# for name in media_people[org].keys():
# f.write(','.join([org.replace(" ","+"),name.replace(" ","+"),b.org_name_gender(org,name),str(media_people[org][name])])+ "\n")
#f.close()
In [ ]:
def pct(a,b):
    """Return `a` as a percentage of `b`, as a float.

    Both arguments are converted with float(). When `b` is zero there is
    nothing to take a share of, so 0.0 is returned instead of raising
    ZeroDivisionError.
    """
    denominator = float(b)
    if denominator == 0:
        return 0.0
    return 100*(float(a)/denominator)
# Slice order shared by both pie charts.
GENDER_LABELS = ('female', 'male', 'unknown')


def _gender_pie(counts, denominator, title):
    """Draw one pie chart of counts[gender] as a percentage of `denominator`."""
    P.figure(1, figsize=(6,6))
    fracs = [pct(counts[gender], denominator) for gender in GENDER_LABELS]
    # The first slice ('female') is pulled out slightly from the pie.
    explode = (0.06, 0, 0)
    P.pie(fracs, explode=explode, labels=GENDER_LABELS,
          autopct='%1.1f%%', shadow=True)
    P.title(title, bbox={'facecolor':'0.8', 'pad':5})
    P.show()


# Per outlet: share of articles, and share of distinct authors, by inferred
# gender. Uses the builtin sorted(); the bare `sort` only existed when the
# kernel had been started with pylab's namespace import.
for org in sorted(media_people.keys()):
    print("{0}: {1} bylines".format(org, len(media_people[org])))
    article_count = {"female":0,"male":0,"unknown":0}
    people_count = {"female":0,"male":0,"unknown":0}
    total = 0         # articles by non-ignored authors
    people_total = 0  # non-ignored authors
    for name in media_people[org].keys():
        #gender = b.single_name_gender(name)
        gender = b.org_name_gender(org, name)
        if gender == "ignore":
            continue
        article_count[gender] += media_people[org][name]
        people_count[gender] += 1
        people_total += 1
        total += media_people[org][name]

    if people_total == 0:
        # Every byline was ignored: there are no proportions to draw.
        print("{0}: no countable bylines, skipping charts".format(org))
        continue

    #ARTICLE COUNT CHART
    _gender_pie(article_count, total,
                'Author Gender per Article in {0} across {1} authors and {2} articles'.format(org,len(media_people[org]), total))
    #PEOPLE COUNT CHART
    _gender_pie(people_count, people_total,
                'Unique Author Gender in {0} across {1} authors and {2} articles'.format(org,len(media_people[org]), total))
In [ ]: