notebook.community

Edit and run



In [1]:

    
from nytimesarticle import articleAPI
import ner
import os
import re
import csv
from urllib import urlopen
import sys
csv.field_size_limit(sys.maxsize)









    Out[1]:





131072



In [2]:

    
# Read the NYT API key from the environment instead of committing it to the
# notebook; set NYT_API_KEY before starting the kernel. A missing variable
# raises KeyError here, so the problem surfaces before any request is made.
# NOTE(review): the key that was previously hardcoded in this cell has been
# published and should be revoked.
api = articleAPI(os.environ['NYT_API_KEY'])



In [3]:

    
def parse_articles(articles):
    '''
    Parse a NYT Article Search API response into a list of flat dictionaries.

    Parameters
    ----------
    articles : dict
        Decoded API response. The documents are read from
        articles['response']['docs'].

    Returns
    -------
    list of dict
        One dictionary per document with the keys 'id', 'headline', 'desk',
        'date', 'section', 'source', 'type', 'url', 'word_count',
        'locations' and 'subjects'. The keys 'abstract' and 'snippet' are
        only present when the document carries a non-null value for them.
        Scalar fields that are absent from a document are reported as None,
        and absent keywords yield empty 'locations' / 'subjects' lists.

    Raises
    ------
    KeyError
        If the payload has no 'response' or 'docs' entry. This is left
        strict on purpose so a malformed payload is not mistaken for an
        empty page of results.
    '''
    news = []
    for doc in articles['response']['docs']:
        dic = {}
        dic['id'] = doc.get('_id')

        abstract = doc.get('abstract')
        if abstract is not None:
            dic['abstract'] = abstract.encode("utf8")

        # Guard against a missing headline or one that is not a mapping.
        headline = doc.get('headline')
        main = headline.get('main') if isinstance(headline, dict) else None
        dic['headline'] = main.encode("utf8") if main is not None else None

        dic['desk'] = doc.get('news_desk')

        pub_date = doc.get('pub_date')
        # Keep only the first ten characters, cutting the time of day.
        dic['date'] = pub_date[0:10] if pub_date is not None else None

        dic['section'] = doc.get('section_name')

        snippet = doc.get('snippet')
        if snippet is not None:
            dic['snippet'] = snippet.encode("utf8")

        dic['source'] = doc.get('source')
        dic['type'] = doc.get('type_of_material')
        dic['url'] = doc.get('web_url')
        dic['word_count'] = doc.get('word_count')

        # Collect locations and subjects in a single pass over the keywords.
        locations = []
        subjects = []
        for keyword in doc.get('keywords') or []:
            name = keyword.get('name') or ''
            if 'glocations' in name:
                locations.append(keyword.get('value'))
            if 'subject' in name:
                subjects.append(keyword.get('value'))
        dic['locations'] = locations
        dic['subjects'] = subjects

        news.append(dic)
    return news



In [7]:

    
def get_articles(date, max_pages=100):
    '''
    Fetch and parse the articles about women published in one year.

    Parameters
    ----------
    date : str
        The year in string format (e.g. '1980'). It is concatenated with
        '0101' and '1231' to form the begin and end dates of the search.
    max_pages : int, optional
        Upper bound on the number of result pages requested. Defaults to
        100 because NYT limits the pager to the first 100 pages.

    Returns
    -------
    list of dict
        The parsed articles (as produced by parse_articles) for that year,
        in the order the pages were returned.
    '''
    all_articles = []
    for page in range(0, max_pages):
        response = api.search(
               fq = {'source':['Reuters','AP', 'The New York Times'],'subject.contains':'women'},
               begin_date = date + '0101',
               end_date = date + '1231',
               sort='oldest',
               page = str(page))
        parsed = parse_articles(response)
        if not parsed:
            # An empty page means the result set is exhausted, so stop
            # instead of spending the remaining requests on empty pages.
            break
        # extend() avoids rebuilding the whole list on every page.
        all_articles.extend(parsed)
    return all_articles



In [16]:

    
women_all = []
for i in range(1980,2015):
    print 'Processing' + str(i) + '...'
    women_year =  get_articles(str(i))
    women_all = women_all + women_year









    



Processing1980...
Processing1981...
Processing1982...
Processing1983...
Processing1984...
Processing1985...
Processing1986...
Processing1987...
Processing1988...
Processing1989...
Processing1990...
Processing1991...
Processing1992...
Processing1993...
Processing1994...
Processing1995...
Processing1996...
Processing1997...
Processing1998...
Processing1999...
Processing2000...
Processing2001...
Processing2002...
Processing2003...
Processing2004...
Processing2005...
Processing2006...
Processing2007...
Processing2008...
Processing2009...
Processing2010...
Processing2011...
Processing2012...
Processing2013...
Processing2014...



In [29]:

    
# Spot-check one parsed record to see the dictionary layout of the collected articles.
women_all[2]









    Out[29]:





{'abstract': "Louis Auchincloss letter holds zealots of women's liberation who propose abolition of all sexually segregated clubs may be hurting both sexes; drawing",
 'date': u'1980-01-06',
 'desk': None,
 'headline': "Letters; Our Self-Inflicted Evil Image in Iran A Korean Assassin Who Must Not Die Brodskys for Tonka To Dine and Talk in Sexual Segregation The Crusaders and the Pragmatists Why 'The Brethren'? Nuclear Power Installations Belong Underground",
 'id': u'4fc4a76745c1498b0dac107b',
 'locations': [u'SOUTH KOREA', u'UNITED STATES', u'IRAN', u'UNITED STATES'],
 'section': None,
 'snippet': "Louis Auchincloss letter holds zealots of women's liberation who propose abolition of all sexually segregated clubs may be hurting both sexes; drawing",
 'source': u'The New York Times',
 'subjects': [u'ATOMIC ENERGY AND WEAPONS',
  u'ELECTRIC LIGHT AND POWER',
  u'PRESIDENTIAL ELECTION OF 1980',
  u'PRESIDENTIAL ASPIRANTS AND PRE-CONVENTION CAMPAIGN',
  u'ORGANIZATIONS, SOCIETIES AND CLUBS',
  u'WOMEN'],
 'type': u'Letter',
 'url': u'http://query.nytimes.com/gst/abstract.html?res=9C0CE7D91330E631A25755C0A9679C94619FD6CF',
 'word_count': 1996}



In [26]:

    
for i in women_all[:100]:
    x = i['locations']
    for s in x: 
        if s != "UNITED STATES":
            print s









    



TEXAS
MISSOURI
ILLINOIS
SOUTH KOREA
IRAN
NEW YORK CITY
IRAN
AFGHANISTAN
AFGHANISTAN
AFGHANISTAN
AFGHANISTAN
AFGHANISTAN
AFGHANISTAN
NEW YORK STATE
NEW YORK CITY
VIETNAM
NEW YORK CITY
NEW YORK CITY
NEW YORK CITY
JORDAN, HASHEMITE KINGDOM OF
AFGHANISTAN
AFGHANISTAN
POLAND
POLAND
PHILADELPHIA (PA)
ILLINOIS
OKLAHOMA
MIDDLE EAST
MICHIGAN
AFGHANISTAN
NEW YORK CITY
MIDDLE EAST
MIDDLE EAST
NEW YORK CITY
UNION OF SOVIET SOCIALIST REPUBLICS
AFGHANISTAN
AFGHANISTAN
NEW JERSEY
NEWARK MUSEUM (NJ)
GREAT BRITAIN
AFGHANISTAN
IRAN
YUGOSLAVIA
NEW YORK STATE
ISRAEL, STATE OF
ISRAEL, STATE OF
PERU
PERU
YUGOSLAVIA
THAILAND
VENEZUELA
VENEZUELA
CALIFORNIA
CALIFORNIA
CALIFORNIA
MISSOURI
NEW JERSEY
NEW JERSEY
NEW YORK STATE
NEW JERSEY
FIRST BAPTIST CHURCH (WASHINGTON, DC)
SPAIN
NEW JERSEY
CAMBODIA
CUBA
VIRGINIA
TEXAS
ALABAMA



In [ ]: