In [1]:
from nytimesarticle import articleAPI
import ner
import os
import re
import csv
from urllib import urlopen
import sys
# Raise the csv module's per-field size cap to sys.maxsize; the call returns
# the previous limit, which is what the notebook shows as this cell's output.
# NOTE(review): presumably needed for very long text fields - confirm.
csv.field_size_limit(sys.maxsize)
Out[1]:
In [2]:
# Build the NYT Article Search client used by get_articles().
# The key is read from the NYT_API_KEY environment variable so the credential
# is not stored in the notebook; a KeyError is raised if it is not set.
api = articleAPI(os.environ['NYT_API_KEY'])
In [3]:
def parse_articles(articles):
    '''
    Parse a NYT Article Search API response into a list of dictionaries.

    Parameters
    ----------
    articles : dict
        Decoded API response. The documents are read from
        ``articles['response']['docs']``.

    Returns
    -------
    list of dict
        One dictionary per document with the keys 'id', 'headline', 'desk',
        'date', 'section', 'source', 'type', 'url', 'word_count',
        'locations' and 'subjects'. The keys 'abstract' and 'snippet' are
        only present when the document carries a non-null value for them.

    Raises
    ------
    KeyError
        If the response has no 'response'/'docs' entry, or a document lacks
        one of the non-optional fields.
    '''
    news = []
    for doc in articles['response']['docs']:
        dic = {}
        dic['id'] = doc['_id']
        # .get() so that a document which omits the field entirely is handled
        # the same way as one that carries an explicit null.
        abstract = doc.get('abstract')
        if abstract is not None:
            dic['abstract'] = abstract.encode("utf8")
        dic['headline'] = doc['headline']['main'].encode("utf8")
        dic['desk'] = doc['news_desk']
        dic['date'] = doc['pub_date'][0:10]  # cutting time of day.
        dic['section'] = doc['section_name']
        snippet = doc.get('snippet')
        if snippet is not None:
            dic['snippet'] = snippet.encode("utf8")
        dic['source'] = doc['source']
        dic['type'] = doc['type_of_material']
        dic['url'] = doc['web_url']
        dic['word_count'] = doc['word_count']
        # Split the keywords into locations and subjects in a single pass.
        # The two tests are independent substring checks on the keyword name,
        # so a keyword may land in both lists.
        locations = []
        subjects = []
        for keyword in doc.get('keywords') or []:
            if 'glocations' in keyword['name']:
                locations.append(keyword['value'])
            if 'subject' in keyword['name']:
                subjects.append(keyword['value'])
        dic['locations'] = locations
        dic['subjects'] = subjects
        news.append(dic)
    return news
In [7]:
def get_articles(date):
    '''
    Fetch and parse the articles about women for one calendar year.

    Parameters
    ----------
    date : str
        A four-digit year such as '1980'. '0101' and '1231' are appended to
        it to form the begin and end dates of the search.

    Returns
    -------
    list of dict
        The parsed articles (see parse_articles) from every page fetched,
        in the order the API returned them (sorted oldest first).

    Notes
    -----
    Uses the module-level ``api`` client and ``parse_articles``. Paging
    stops at the first page that yields no articles: results are requested
    in a fixed order, so an empty page is taken to mean the result set is
    exhausted and the remaining requests would be wasted.
    '''
    all_articles = []
    # NYT limits pager to first 100 pages. But rarely will you find over
    # 100 pages of results anyway.
    for page_number in range(0, 100):
        response = api.search(
            fq={'source': ['Reuters', 'AP', 'The New York Times'],
                'subject.contains': 'women'},
            begin_date=date + '0101',
            end_date=date + '1231',
            sort='oldest',
            page=str(page_number))
        page_articles = parse_articles(response)
        if not page_articles:
            break
        # extend() appends in place instead of rebuilding the list per page.
        all_articles.extend(page_articles)
    return all_articles
In [16]:
# Collect the parsed articles for every year from 1980 through 2014
# (range() excludes its upper bound, 2015).
women_all = []
for year in range(1980, 2015):
    # A parenthesised single-argument print produces the same output on
    # Python 2 and Python 3.
    print('Processing ' + str(year) + '...')
    women_year = get_articles(str(year))
    # extend() appends in place instead of rebuilding the list every year.
    women_all.extend(women_year)
In [29]:
# Spot check: display the third parsed article dictionary.
women_all[2]
Out[29]:
In [26]:
# Print every location tag other than "UNITED STATES" found in the first
# 100 collected articles, one per line.
for article in women_all[:100]:
    for place in article['locations']:
        if place == "UNITED STATES":
            continue
        print(place)
In [ ]: