In [ ]:
# In case you haven't installed the API
! pip install nytimesarticle
In [1]:
from nytimesarticle import articleAPI
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import datetime
import csv
import math
import time
from ProgressBar import ProgressBar
The consumer confidence index (CCI) is based on surveys of real consumers, who are asked their opinions of current and future economic conditions as well as their personal economic situation. The responses are encoded and normalized to a baseline of 100 set by the 1985 results. The index is compiled monthly by the Organisation for Economic Co-operation and Development and can be downloaded directly as a CSV from https://data.oecd.org/leadind/consumer-confidence-index-cci.htm.
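To get a feel for the CCI data, the downloaded CSV can be loaded with pandas. The cell below is a minimal sketch: the file name cci.csv and the column names TIME and Value (the usual OECD export layout) are assumptions and may need to be adjusted to match the actual download.
In [ ]:
# Sketch: load the OECD CCI export (assumed file name and column names)
cci = pd.read_csv('cci.csv')               # assumed local file name
cci = cci[['TIME', 'Value']]               # OECD exports typically carry the month in TIME and the index in Value
cci['TIME'] = pd.to_datetime(cci['TIME'])  # parse the monthly timestamps
cci = cci.set_index('TIME').sort_index()
cci['Value'].plot(title='Consumer confidence index')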
The New York Times Article Search API allows searching for and retrieving the headlines and lead paragraphs of articles dating back to 1851. Each article also comes with metadata such as its publication date and the section in which it appeared. It is possible that not every article makes it into the database, but an inspection of recent dates finds on the order of tens of articles per day, which seems reasonable. The API returns data as JSON, which can be used directly or transformed into CSV.
To access the API, one needs to obtain an API key from https://developer.nytimes.com/signup and install the client library using:
! pip install nytimesarticle
In [2]:
from nytimesarticle import articleAPI
api = articleAPI('ca372b5c9318406780fe9ebef28e96a1')
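As a quick sanity check, a single call shows the JSON structure that the rest of the code relies on: a 'response' dictionary containing 'meta' (with the total hit count) and 'docs' (the list of articles, each carrying '_id', 'pub_date', 'headline', and 'lead_paragraph'). This is just an illustrative sketch; the date range and section filter are arbitrary choices.
In [ ]:
# Sketch: inspect the JSON returned by one search call
result = api.search(fl=['_id', 'headline', 'lead_paragraph', 'pub_date'],
                    begin_date=19900101, end_date=19900107,
                    fq={'section_name': 'Business'}, page=1)
print result['response']['meta']['hits']   # total number of matching articles
doc = result['response']['docs'][0]        # first article on this page of results
print doc['pub_date'], doc['headline']['main']
print doc['lead_paragraph']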
The first thing to note is the API's usage limits: calls are limited to 1000 per day and 5 per second, so our function needs to sleep between calls. The trickier issue is that the API will only return 100 pages of results for any given search. Searching a year-long window therefore returns too many results, and you would only get the first few weeks before the 100-page limit is filled. For this reason, we iterate through search windows of one week and monitor the number of pages found to make sure it never exceeds 100 for any single search.
We will save each year of data as a separate CSV. The steps for downloading the data to CSV are as follows.
In [3]:
def downloadToFile(startdate, enddate, filename):
    """
    Makes API calls to extract id, publication date, headline, and lead paragraph from NY Times articles in the date range.
    Then, saves the data to a local file in csv format.
    startdate: start of date range to extract (yyyymmdd)
    enddate: end of date range to extract (yyyymmdd)
    filename: csv file to create and append to
    """
    startdate = datetime.datetime.strptime(str(startdate), '%Y%m%d')
    enddate = datetime.datetime.strptime(str(enddate), '%Y%m%d')
    sliceStart = startdate
    while (sliceStart < enddate):
        leads = []
        ids = []
        dates = []
        headlines = []
        # Search in one-week windows so no single search exceeds the 100-page limit
        sliceEnd = min(sliceStart + datetime.timedelta(weeks=1), enddate)
        sliceStartInt = int(sliceStart.strftime('%Y%m%d'))
        sliceEndInt = int(sliceEnd.strftime('%Y%m%d'))
        print 'Downloading from {} to {}'.format(sliceStartInt, sliceEndInt)
        while True:
            try:
                numhits = api.search(fl=['_id'], begin_date=sliceStartInt, end_date=sliceEndInt, fq={'section_name': 'Business'}, page=1)['response']['meta']['hits']
                time.sleep(1)  # stay under the 5-calls-per-second limit
                break
            except:
                print 'JSON error avoided'
        pages = int(math.ceil(float(numhits) / 10))  # the API returns 10 articles per page
        time.sleep(1)
        pbar2 = ProgressBar(pages)
        print '{} pages to download'.format(pages)  # note that you can't download past page number 100
        for page in range(1, min(pages + 1, 100)):
            while True:
                try:
                    articles = api.search(fl=['_id', 'headline', 'lead_paragraph', 'pub_date'], begin_date=sliceStartInt, end_date=sliceEndInt, fq={'section_name': 'Business'}, page=page)
                    time.sleep(1)
                    break
                except:
                    print 'JSON error avoided'
            pbar2.increment()
            for i in articles['response']['docs']:
                # Skip articles that are missing a lead paragraph or headline
                if (i['lead_paragraph'] is not None) and (i['headline'] != []):
                    headlines.append(i['headline']['main'])
                    leads.append(i['lead_paragraph'])
                    ids.append(i['_id'])
                    dates.append(i['pub_date'])
        pbar2.finish()
        sliceStart = sliceEnd
        zipped = zip(ids, dates, headlines, leads)
        if zipped:
            with open(filename, "a") as f:
                writer = csv.writer(f)
                for line in zipped:
                    writer.writerow([unicode(s).encode("utf-8") for s in line])
In [4]:
downloadToFile(19900101, 19900115, 'Sample_Output.csv')
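The Sample_Output.csv call above is just a short demonstration. The yearly files referenced below (e.g. 1990_Output.csv) can be produced by calling downloadToFile once per calendar year; the loop below is a sketch, and given the 1000-calls-per-day limit each year may need to be downloaded in a separate session.
In [ ]:
# Sketch: download one CSV per year (subject to the daily API call limit)
for year in range(1990, 1992):
    downloadToFile(int('{}0101'.format(year)), int('{}1231'.format(year)),
                   '{}_Output.csv'.format(year))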
Let's check what we have in the files now. We can iterate over the yearly CSV files to build a single dataframe with all of the data.
In [6]:
all_data_list = []
for year in range(1990, 1992):
    data = pd.read_csv('{}_Output.csv'.format(year), header=None)
    all_data_list.append(data)  # list of dataframes
data = pd.concat(all_data_list, axis=0)
data.columns = ['id', 'date', 'headline', 'lead']
data.head()
Out[6]:
(first rows of the combined dataframe, with columns id, date, headline, and lead)