In [1]:
import socket
import os
import time
import urllib.request, urllib.parse, urllib.error
import re
from bs4 import BeautifulSoup
import ssl
import requests
import re

In [2]:
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('ggplot')

In [1]:
#setup key
key = "d17b30b16180424f8a95060b534aa726"
rl = 'http://api.nytimes.com/svc/archive/v1/{}/{}.json?api-key={}' 
#choose the year and month
url = rl.format(2016,7,key)

In [5]:
#get content from the link,
r = requests.get(url)
article = r.json()

In [256]:
article_rp = article['response']

In [257]:
#retrieve links and ids for all the articles
url_0716 = []
url_0716t = []
for i in article_rp['docs']:
    if i['document_type'] == 'article':
        url_0716.append(i['web_url'])
        url_0716t.append(i['_id'])

In [258]:
len(url_0716),len(url_0716t)


Out[258]:
(5175, 5175)

In [246]:
#save all keys
f = open("key_201607.txt", "a")
for i in url_0716t:
    f.write(i+'\n')
f.close()

In [261]:
#since links starts with query.nytimes do not contain articles, if necessary, drop them
n_match = []
for i in range(len( url_0716)):
    t = len(re.findall('https://query.nytimes.com', url_0716[i]))
    if t>0:
        n_match.append(i)

In [264]:
len(n_match)


Out[264]:
549

In [265]:
#save positions for non-articles
f = open('missing_201607.txt','a')
for i in n_match:
    f.write('%d ' %i)
f.close()

In [212]:
#ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

n_match = []
f = open("article_201607.txt", "a")
a_10 = ''
for j in range(5170,5175):
    #ignore all non-articles
    t = len(re.findall('https://query.nytimes.com', url_0716[j]))
    if t>0:
        n_match.append(j)
        continue
    
    html = urllib.request.urlopen(url_0716[j],context = ctx).read()
    soup = BeautifulSoup(html,'html.parser')
    time.sleep(0.5)
    target = soup.findAll('p', class_=['story-body-text', 'story-content'])
    a1 = ''
    for i in target:
        s = ''.join(str(ii) for ii in i)
        a1 = a1+s+'\n'
    a_10 = a_10 +'%d '%(j+1)+soup.title.string+'\n'+url_0716t[j]+'\n' + a1 + '************\n'
    #print(j)
    if (j+1) % 10 == 0:
        print(j+1)
        f.write(a_10)
        a_10 = ''
#f.write(a_10)
f.close()


5170
5171
5172
5173
5174

Simple Summary


In [275]:
arti_tm = {'0716':7159,'0816':20659,'0916':15399,'1016':18152,'1116':5511,'1216':5247,'0117':5213,'0217':5089,'0317':5706,'0417':5064,'0517':5529,'0617':5366}

In [276]:
arti_gm = {'0716':5175,'0816':4482,'0916':4952,'1016':5218,'1116':4830,'1216':4508,'0117':4587,'0217':4332,'0317':5088,'0417':4581,'0517':4881,'0617':4881}

In [306]:
x1,y1 = zip(*arti_gm.items())

In [304]:
x,y = zip(*arti_tm.items())

In [364]:
fig=plt.figure(figsize=(15, 8), dpi= 200, facecolor='w', edgecolor='k')


plt.plot(list(range(1,13)),y,'r',label = 'online search total %d' %sum(y))
plt.plot(list(range(1,13)),y1,'b',label ='scraped total %d'%sum(y1))
plt.xlabel('month/year',fontsize=20)
plt.ylabel('#articles',fontsize=20)
plt.title('Article Amount Comparison',fontsize=30)
plt.legend(loc='best',fontsize=20)
plt.yticks(fontsize=15)
plt.xticks(np.arange(1, 13, 1.0),x,rotation=45,fontsize=15)
plt.show()



In [341]:
#sometimes link set may contain links not for an article
f = open('/Users/zhanghengqian/Dropbox/Duke/fall-2017/IDS/nytimes_keys/missing_201607.txt','r')
s = f.read()
f.close()

In [343]:
len(s.split())


Out[343]:
549

DATA CLEAN


In [129]:
f = open('/Users/zhanghengqian/Dropbox/Duke/fall-2017/IDS/nytimes_articles/article_201706.txt','r').read()

In [7]:
data = f.split('************')

Multi-string replace function original from https://gist.github.com/bgusach/a967e0587d6e01e889fd1d776c5f3729


In [130]:
def multireplace(string, replacements):
    """
    Given a string and a replacement map, it returns the replaced string.
    :param str string: string to execute replacements on
    :param dict replacements: replacement dictionary {value to find: value to replace}
    :rtype: str
    """
    # Place longer ones first to keep shorter substrings from matching where the longer ones should take place
    # For instance given the replacements {'ab': 'AB', 'abc': 'ABC'} against the string 'hey abc', it should produce
    # 'hey ABC' and not 'hey ABc'
    substrs = sorted(replacements, key=len, reverse=True)

    # Create a big OR regex that matches any of the substrings to replace
    regexp = re.compile('|'.join(map(re.escape, substrs)))

    # For each match, look up the new string in the replacements
    return regexp.sub(lambda match: replacements[match.group(0)], string)

In [131]:
##
short_com = re.compile(r'>(.*?)<')
long_com  = re.compile(r'<a.*?a>')
def create_key(string):
    return dict(zip(long_com.findall(string),short_com.findall(string)))

In [132]:
key = create_key(f)
new_d = multireplace(f,key)

In [133]:
new_dd = new_d.split('************')

In [ ]:
for i in range(4882):
    if new_dd[i].count('<') > 1:
        print(i)

In [165]:
print(f.split('************')[2])


3 Corrections: June 3, 2017 - The New York Times
5931f12195d0e024b587461a
<strong>INTERNATIONAL</strong>
An <a href="https://www.nytimes.com/2017/06/01/world/europe/climate-paris-agreement-trump-china.html">article</a> on Friday about reaction to President Trump’s decision to withdraw the United States from the Paris climate accord attributed a quotation incorrectly. It was Lavanya Rajamani, a professor at the Center for Policy Research in New Delhi — not her colleague Navroz K. Dubash — who said, “Stating that the Paris Agreement hamstrings the U.S. while allowing India and China to increase their emissions is baffling. The Agreement allows every country to choose its pledge tailored to its national circumstances.” The article also referred incorrectly to comments by President Emmanuel Macron of France. In criticizing Mr. Trump’s decision, Mr. Macron spoke in French before switching to English, not the other way around.
<strong>NEW YORK</strong>
An <a href="https://www.nytimes.com/2017/05/26/nyregion/the-new-math-of-parking-placards.html">article</a> last Saturday about parking permits in New York City misstated the number of placards given to city employees, as well as the total number of people employed by the city and the percentage of total placards held by city workers. About 114,600 of the placards were given to city employees, not 160,500. As of March, there were 317,944 full-time city employees, not 293,296, meaning that less than a third of all city employees have parking permits, not more than half.
<strong>SPORTS</strong>
A <a href="https://www.nytimes.com/2017/05/31/sports/baseball/yankees-slugger-aaron-judge-is-third-in-early-al-all-star-voting.html">headline</a> on Thursday with an article about the Yankees’ Aaron Judge referred incorrectly to his position in voting for the All-Star Game. As the article correctly noted, he is third overall, not third in the American League.
<em>•</em>
An <a href="http://www.nytimes.com/2017/05/27/sports/baseball/new-york-mets-pittsburgh-pirates.html" title="Corrected artice">article</a> in some copies on Sunday about the Mets’ 5-4 loss to the Pittsburgh Pirates misidentified the Pittsburgh player who advanced to third base on an Addison Reed wild pitch and scored the tying run in the ninth inning. The player was the pinch-runner Gift Ngoepe — not Jordy Mercer, for whom Ngoepe entered the game.
<strong>WEEKEND</strong>
An <a href="https://www.nytimes.com/2017/06/01/arts/design/wild-waves-and-quiet-streets-2-artists-working-from-memory.html">art review</a> on Friday about two shows at the Metropolitan Museum of Art, “Peder Balke: Painter of Northern Light” and “City of Memory: William Chappel’s Views of Early 19th-Century New York,” misstated the titles of two works by Balke, a Norwegian artist. The works are “The North Cape” (1853) and “The North Cape in the Moonlight” (1848), not “The Cape North” and “Cape North in Moonlight.” The review also misstated when Balke first visited Dresden, Germany. It was 1835-36, not 1833. And the review misstated the number of Balke’s contemporaries who are represented in the Met show. There are three, not four. (They are Johan Christian Dahl, Thomas Fearnley and August Cappelen.)
<strong>OBITUARIES</strong>
An <a href="https://www.nytimes.com/2017/05/24/arts/music/barbara-smith-conrad-dead-mezzo-soprano-broke-race-barrier.html">obituary</a> on May 25 about the mezzo-soprano Barbara Smith Conrad, who as a young woman was denied a role in a University of Texas opera production in 1957 because she was black, referred incorrectly to Ms. Conrad’s survivors, using information from her family. She is survived by a brother, Howard K. Smith; it is not the case that she “leaves no immediate survivors.”
<em>To contact the newsroom regarding correction requests, complaints or other comments about our coverage, please email <a href="mailto:nytnews@nytimes.com">nytnews@nytimes.com</a> or call </em><em>1-844-NYT-NEWS (1-844-698-6397). Comments on editorials may be e-mailed to</em> <em><a href="mailto:letters@nytimes.com">letters@nytimes.com</a></em> <em>or faxed to (212) 556-3622. </em>
<em>For newspaper delivery questions: 1-800-NYTIMES (1-800-698-4637) or e-mail</em> <em><a href="mailto:customercare@nytimes.com">customercare@nytimes.com</a></em><em>.</em>


In [166]:
print(new_dd[2])


3 Corrections: June 3, 2017 - The New York Times
5931f12195d0e024b587461a
<strong>INTERNATIONAL</strong>
An have long spent up to half their year overseas on Friday about reaction to President Trump’s decision to withdraw the United States from the Paris climate accord attributed a quotation incorrectly. It was Lavanya Rajamani, a professor at the Center for Policy Research in New Delhi — not her colleague Navroz K. Dubash — who said, “Stating that the Paris Agreement hamstrings the U.S. while allowing India and China to increase their emissions is baffling. The Agreement allows every country to choose its pledge tailored to its national circumstances.” The article also referred incorrectly to comments by President Emmanuel Macron of France. In criticizing Mr. Trump’s decision, Mr. Macron spoke in French before switching to English, not the other way around.
<strong>NEW YORK</strong>
An INTERNATIONAL last Saturday about parking permits in New York City misstated the number of placards given to city employees, as well as the total number of people employed by the city and the percentage of total placards held by city workers. About 114,600 of the placards were given to city employees, not 160,500. As of March, there were 317,944 full-time city employees, not 293,296, meaning that less than a third of all city employees have parking permits, not more than half.
<strong>SPORTS</strong>
A article on Thursday with an article about the Yankees’ Aaron Judge referred incorrectly to his position in voting for the All-Star Game. As the article correctly noted, he is third overall, not third in the American League.
<em>•</em>
An NEW YORK in some copies on Sunday about the Mets’ 5-4 loss to the Pittsburgh Pirates misidentified the Pittsburgh player who advanced to third base on an Addison Reed wild pitch and scored the tying run in the ninth inning. The player was the pinch-runner Gift Ngoepe — not Jordy Mercer, for whom Ngoepe entered the game.
<strong>WEEKEND</strong>
An article on Friday about two shows at the Metropolitan Museum of Art, “Peder Balke: Painter of Northern Light” and “City of Memory: William Chappel’s Views of Early 19th-Century New York,” misstated the titles of two works by Balke, a Norwegian artist. The works are “The North Cape” (1853) and “The North Cape in the Moonlight” (1848), not “The Cape North” and “Cape North in Moonlight.” The review also misstated when Balke first visited Dresden, Germany. It was 1835-36, not 1833. And the review misstated the number of Balke’s contemporaries who are represented in the Met show. There are three, not four. (They are Johan Christian Dahl, Thomas Fearnley and August Cappelen.)
<strong>OBITUARIES</strong>
An SPORTS on May 25 about the mezzo-soprano Barbara Smith Conrad, who as a young woman was denied a role in a University of Texas opera production in 1957 because she was black, referred incorrectly to Ms. Conrad’s survivors, using information from her family. She is survived by a brother, Howard K. Smith; it is not the case that she “leaves no immediate survivors.”
<em>To contact the newsroom regarding correction requests, complaints or other comments about our coverage, please email  or call </em><em>1-844-NYT-NEWS (1-844-698-6397). Comments on editorials may be e-mailed to</em> <em></em> <em>or faxed to (212) 556-3622. </em>
<em>For newspaper delivery questions: 1-800-NYTIMES (1-800-698-4637) or e-mail</em> <em> are you interested in? Let us know: </em><em>.</em>


In [146]:
len(new_dd)


Out[146]:
4882