In [1]:
# Rate of growth of deep learning (DL) vs computer science (CS) overall
# "Deep learning is a shock to the landscape"
# Comparative advantage index (RCA): (number of papers per region) / sqrt(number of papers)
# or subset papers by number of citations, then (n papers per author/institute) / (total number of papers)
# 2010 vs 2015 vs change (number of citations)
# 2010 vs 2015 by institute (could plot change in RCA on a map)

# Quickly plot # citations vs time since publication --> then try to correct for this
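
As a concrete version of the RCA idea noted above, a minimal helper could compute the classic share-of-shares form (a sketch only; the note lists two candidate definitions and this implements neither exactly, and the column names "region" and "topic" are placeholders, e.g. region_col="institute", topic_col="topic" for the dataframe built below):

import pandas as pd

# Sketch: revealed-comparative-advantage style index
# RCA(region, topic) = (region's share of its own papers on the topic)
#                      / (topic's share of all papers)
def rca_index(papers, region_col="region", topic_col="topic"):
    region_topic = papers.groupby([region_col, topic_col]).size()
    region_total = papers.groupby(region_col).size()
    topic_share = papers.groupby(topic_col).size() / float(len(papers))
    region_share = region_topic.div(region_total, level=region_col)
    return region_share.div(topic_share, level=topic_col).rename("rca")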

In [104]:
%matplotlib inline
import pandas as pd
from random import choice
from copy import deepcopy
import numpy as np
import seaborn as sns
import matplotlib.dates as mdate
from matplotlib import pyplot as plt
import calendar
import time
from mpl_toolkits.basemap import Basemap
from datetime import timedelta
import matplotlib.patches as patches
import datetime
from scipy.optimize import curve_fit
import matplotlib.cm as cm

In [3]:
# Date formatting info
date_fmt = "%b '%y" # eg Jan '88
date_formatter = mdate.DateFormatter(date_fmt)

In [4]:
df_topics = pd.read_csv("/Users/hep/Downloads/deep_learning_NN_topics.csv")
df = pd.read_json("/Users/hep/Downloads/CS_STATS_id_title_tag_MAK-matched_GRID-matched.json")
df = df.join(df_topics.set_index("id"),on="pid")
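
As a quick optional sanity check (not recorded in the original run), the fraction of papers that picked up a topic label from the join can be inspected:

# Fraction of rows with a deep-learning topic flag after the join
df.is_Deep_learning.notnull().mean()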

In [5]:
# Drop non-matches, derive the topic label, and add calendar fields from the raw dates (epoch milliseconds)
#df = pd.read_json("/Users/hep/Downloads/CS_STATS_id_title_tag_MAK-matched_GRID-matched.json")
df = df.loc[df.matched==True]
df.sort_values("date",inplace=True)
df["topic"] = df["is_Deep_learning"].apply(lambda x : str(x))
df["fixed_date"] = df.date.apply(lambda d : mdate.epoch2num(d/1000))
df["month"] = df.date.apply(lambda x: time.gmtime(x/1000).tm_mon)
df["year"] = df.date.apply(lambda x: time.gmtime(x/1000).tm_year)
df["quarter"] = np.ceil(df.month/3)

# Generate one row per institute
rows = []
for _,row in df.iterrows():
    insts = row["institutes"]
    lat_lon_scores = row["lat_lon_score"]
    # Match lat/lon/score/institute
    for inst,(lat,lon,score) in zip(insts,lat_lon_scores):
        # Create the new row and drop redundant data 
        _row = deepcopy(row)
        _row.drop(["institutes","lat_lon_score","matched"],inplace=True)
        _row["institute"] = inst
        _row["lat"] = lat
        _row["lon"] = lon
        _row["score"] = score
        rows.append(_row)

# Free the per-paper dataframe and build the one-row-per-institute version,
# keeping only the high-confidence geocoding matches in _df
del df
df = pd.DataFrame(rows)
_df = deepcopy(df.loc[df.score > 0.99])
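
Two quick sanity checks on the expansion above (a sketch; these numbers were not recorded in the original run):

# Average number of institute rows per paper after the expansion
print(len(df) / df.pid.nunique())
# Fraction of institute rows that survive the score > 0.99 geocoding cut
print(len(_df) / len(df))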

In [6]:
# Make a plot of number of citations to understand "rare" events
fig, ax = plt.subplots(figsize=(10,5))

# Loop over topics, plotting one line per topic (one row per paper)
for _topic,grouped in _df.drop_duplicates("pid").groupby("topic"):
    ax = grouped.plot.line("fixed_date","citations",ax=ax,label=_topic)

# Format the labels
ax.set_title("Citations by topic")
ax.legend().set_title("Topic")
ax.set_ylabel("Citations per publication")
ax.set_xlabel("Date")
ax.xaxis_date()
ax.xaxis.set_major_formatter(date_formatter)
fig.autofmt_xdate()
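
Following the note in the first cell about correcting citation counts for time since publication, one rough correction is citations per year elapsed since the paper's date. The block below is a sketch, not part of the original analysis, and reuses the notebook's imports plus the _df and date_formatter defined above:

# Rough age correction: citations per year since publication
# ("date" is in epoch milliseconds, as elsewhere in this notebook)
_pubs = _df.drop_duplicates("pid").copy()
_pubs["age_years"] = (time.time() - _pubs.date/1000) / (365.25*24*3600)
_pubs["citations_per_year"] = _pubs.citations / _pubs.age_years

fig, ax = plt.subplots(figsize=(10,5))
for _t, g in _pubs.groupby("topic"):
    g.plot.line("fixed_date", "citations_per_year", ax=ax, label=_t)
ax.set_title("Age-corrected citations by topic")
ax.set_ylabel("Citations per year since publication")
ax.set_xlabel("Date")
ax.xaxis_date()
ax.xaxis.set_major_formatter(date_formatter)
fig.autofmt_xdate()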



In [12]:
def make_map(df,ax=None,color="orange",alpha=0.3,s=1,embolden=None,**kwargs):
    '''Plot the lat/lon coordinates in df on a world map.'''
    # Convert lat lon to lists
    lats = list(df.lat)
    lons = list(df.lon)
    # Hack to brighten up markers: plot the marker <embolden> times
    if embolden:
        lats = lats*embolden
        lons = lons*embolden
    # If an axis hasn't been passed, create one
    if ax is None:
        fig,ax = plt.subplots(figsize=(10,5))
    # Make the basemap
    m = Basemap(projection='merc',llcrnrlat=-60,urcrnrlat=65,
                llcrnrlon=-170,urcrnrlon=190,lat_ts=2,resolution='c',)
    m.drawcoastlines(linewidth=0.05,color="white")
    m.drawcountries(linewidth=0.075,color="white")
    m.drawmapboundary(fill_color='black',linewidth=0)
    # Convert the coordinates to this projection
    lons,lats = m(lons,lats)
    # Scatter the data
    m.scatter(x=lons,y=lats,ax=ax,
              latlon=False,alpha=alpha,s=s,
              color=color,**kwargs)
    return ax
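
For reference, a standalone call could look like this (a usage sketch; the resulting figure is not part of the recorded output):

# Plot every well-matched institute location once
ax = make_map(_df.drop_duplicates("institute"))
ax.set_title("Institutes with at least one matched paper")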

In [13]:
def within_six_months(test,event,before=True):
    '''Test whether dataframe row "test" falls within six months
    (before or after) of dataframe row "event".'''
    # Get the time delta (note: "date" values are in epoch milliseconds)
    seconds = float(test.date - event.date)/1000
    delta = timedelta(seconds=seconds)
    # Convert to "months" (this is kind of rough)
    n_months = 12 * delta.days / 365
    # Evaluate
    if before:
        return n_months > -6 and n_months < 0
    else:
        return n_months < 6 and n_months > 0
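
A tiny worked example of the helper, using synthetic rows with dates in epoch milliseconds (a sketch; not part of the original run):

# Two synthetic "rows" roughly three months apart
_event = pd.Series({"date": datetime.datetime(2015, 6, 1).timestamp()*1000})
_test  = pd.Series({"date": datetime.datetime(2015, 3, 1).timestamp()*1000})
print(within_six_months(_test, _event, before=True))   # True: ~3 months before
print(within_six_months(_test, _event, before=False))  # False: not after the event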

In [32]:
def make_and_customise_map(grouped,grouped_event,before,f,popt):
    '''Plot a map of the rows of "grouped" that fall within 6 months of the
    rows in "grouped_event", which correspond to a "rare event": a paper
    with a very large number of citations. "f" and "popt" define the fitted
    trend used to estimate the expected number of papers in that window.'''
    # Pull out any of the rare events, for the date
    event = grouped_event.head(n=1)        
    event_date = datetime.datetime.fromtimestamp(event.date/1000)
    # Get the dataframe rows of grouped within 6 months of event
    condition = grouped.apply(func=within_six_months,
                              args=(event,before),axis=1)
    # Plot the rows on a map
    ax = make_map(grouped.loc[condition])
    make_map(grouped_event,ax,color="lime",embolden=100,
             alpha=1,s=20,marker=(5,2),zorder=10)

    # Generate prediction inputs for the two quarters before/after the event
    # (note: _dates is the quarter-start list left over from the fitting cell)
    if before:
        title_text = "6 months before publication of '"+str(event.title.values[0])+"'"
        _epochs = [x.timestamp() for x in _dates
                   if (x > event_date - timedelta(days=365/2)) and (x < event_date)]
    else:
        title_text = "6 months after publication of '"+str(event.title.values[0])+"'"
        _epochs = [x.timestamp() for x in _dates
                   if (x < event_date + timedelta(days=365/2)) and (x > event_date)]
    
    # Calculate the % difference between total and expected
    total = condition.sum()
    expected = np.ceil(sum(f(_epochs,*popt)))
    pc_diff = np.ceil((total/expected - 1)*100)
    sign = ""
    if pc_diff > 0:
        sign = "+"
    
    # Top title
    ax.text(5e5, 17e6, title_text, color="plum", fontsize=12, weight="bold")

    # Summary of data
    ax.text(5e5, 2.5e6, "Total papers on topic: "+str(condition.sum()), color="white")
    ax.text(5e5, 1.5e6, "Expected: "+str(int(expected))+" ("+sign+str(int(pc_diff))+"%)", color="white")

    # 'Legend'
    ax.text(1.6e7, 3e6, "Institutes involved in the highly cited paper", color="white")
    ax.text(1.6e7, 2e6, "Institutes with new papers on topic", color="white")
    ax.scatter([1.55e7]*100,[3.2e6]*100,color="lime",marker=(5,2))
    ax.scatter([1.55e7]*100,[2.2e6]*100,color="orange",marker="+")
    ax.add_patch(patches.Rectangle((1.5e7,1.1e6),1.5e7,3.1e6,
                                   fill=False,edgecolor="white"))
    
    return ax

In [55]:
# Trend model for papers per quarter: x is the publication date in epoch
# seconds, rescaled so that the exponent stays O(1)
def func(x, a, b, c):
    x = np.asarray(x)/1e8
    return a*x * np.exp(b + c*x)

# Plot number of publications by date per topic, identifying rare events
for _topic,grouped in _df.groupby("topic"):
    # Skip papers not tagged as deep learning
    if _topic == "0":
        continue
    # Aggregate grouped by date and sum
    _counts = []
    _dates = []
    for (_year,_quarter),_grouped in grouped.groupby(["year","quarter"]):
        if _year < 1990:
            continue
        _counts.append(len(_grouped))
        _dates.append(datetime.datetime.strptime(str(int(3*(_quarter))-2)+" "+str(_year),'%m %Y'))
    # Drop the final two quarters, which are likely incomplete
    _dates = _dates[:-2]
    _counts = _counts[:-2]
    
    # Plot counts per quarter
    fig,ax = plt.subplots(figsize=(10,5))
    ax.plot(_dates,_counts,label="Data")
    ax.set_ylabel("Number of papers on arXiv per quarter")
    ax.set_xlabel("Journal publication date")
    ax.xaxis_date()
    ax.xaxis.set_major_formatter(date_formatter)
    fig.autofmt_xdate()

    ax.set_ylim(0,ax.get_ylim()[1])
    
    # Fit a trend line through counts and plot
    _epochs = [x.timestamp() for x in _dates]
    
    # Use only quarters from 2005 onwards for the fit
    _training_epochs = []
    _training_counts = []
    for c,d in zip(_counts,_dates):
        if d.year < 2005:
            continue
        _training_epochs.append(d.timestamp())
        _training_counts.append(c)
    popt, pcov = curve_fit(func, _training_epochs, _training_counts, p0 = (0.1,0.,1.))
    ax.plot(_dates,func(_epochs,*popt),label="Fitted trend")
    
    # Define a large citation count as anything in the top 0.05% for this topic
    large_citation = np.percentile(grouped.citations,99.95)
    print(_topic,large_citation,len(grouped))

    # Get rare events and draw these in at the right vertical level
    rare_events = grouped.loc[grouped.citations > large_citation]
    added_label = False
    for _,event in rare_events.iterrows():
        # Count the number of publications in the same quarter as the event
        _yr = grouped.year == event.year 
        _mn = grouped.quarter == event.quarter
        _count = (_yr & _mn).sum()
        _date = datetime.datetime.fromtimestamp(event.date/1000)
        label = None
        if not added_label:
            label = "Publication date of paper with large number of citations"
            added_label = True
        ax.plot([_date, _date], (_count-300,_count+300),
                c="red",label=label)
    # Draw the legend
    ax.legend(loc=2)
    
    # Also plot the ratio of observed counts to the fitted trend
    fig,ax = plt.subplots(figsize=(10,5))
    ax.plot(_dates, (_counts / func(_epochs,*popt)))
    ax.plot([min(_dates),max(_dates)], (1,1), "k--")

    # Mark the rare-event publication dates on the ratio plot
    for _,event in rare_events.iterrows():
        _date = datetime.datetime.fromtimestamp(event.date/1000)
        ax.plot([_date, _date], (0,2), c="red")

    # Group the rare events by arXiv ID
    for _,grouped_event in rare_events.groupby("pid"):
        # Plot 6 months before rare event
        make_and_customise_map(grouped=grouped,
                               grouped_event=grouped_event,
                               before=True,f=func,popt=popt)
        # Plot 6 months after rare event
        make_and_customise_map(grouped=grouped,
                               grouped_event=grouped_event,
                               before=False,f=func,popt=popt)

        # Only testing for now, so break out
        #break
    # Only testing for now, so break out
    #break


1 1954.0 19112
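
As a quick follow-up (a sketch using the _dates, _counts, _epochs and popt left over from the cell above, for the last topic processed), the observed and fitted counts for the most recent quarters can be tabulated directly:

# Observed vs fitted papers per quarter for the last topic processed
pd.DataFrame({"quarter_start": _dates,
              "observed": _counts,
              "expected": np.round(func(_epochs, *popt), 1)}).tail(8)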

In [49]:
popt,pcov


Out[49]:
(array([  9.28738601e-07,  -2.00407530e+00,   1.38988957e+00]),
 array([[ -4.40352680e+03,   4.75103267e+09,   7.00482838e+00],
        [  4.75103267e+09,  -5.12596209e+15,  -7.55761704e+06],
        [  7.00482838e+00,  -7.55761704e+06,   0.00000000e+00]]))

In [56]:
df.to_csv("~/Downloads/compsci_deeplearning_geocitations.csv")
_df.to_csv("/Users/hep/Downloads/compsci_deeplearning_geocitations_goodmatch.csv")

In [65]:
_df.drop("Unnamed: 0",axis=1,inplace=True)

In [66]:
_df.columns


Out[66]:
Index(['citations', 'date', 'pid', 'title', 'publication_year', 'Neural_nets',
       'Deep_learning', 'is_Deep_learning', 'topic', 'fixed_date', 'month',
       'year', 'quarter', 'institute', 'lat', 'lon', 'score'],
      dtype='object')

In [68]:
len(_df)/len(df)


Out[68]:
0.8106940687021166

In [70]:
len(_df)


Out[70]:
142640

In [71]:
len(df_topics)


Out[71]:
134324

In [73]:
sum(pd.read_json("/Users/hep/Downloads/CS_STATS_id_title_tag_MAK-matched_GRID-matched.json").matched == True)


Out[73]:
122198

In [74]:
122198/134324


Out[74]:
0.909725737768381

In [75]:
import arxiv

In [103]:
def chunks(l, n=400):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
         yield l[i:i + n]

datas = []
for chunk in chunks(id_list):
    _data = {result["id"]:[tag["term"] for tag in result["tags"]] for result in arxiv.query(id_list=chunk)}
    datas.append(_data)
# Merge the per-chunk dicts into a single mapping
data = {k: v for d in datas for k, v in d.items()}


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-103-c226013cab2d> in <module>()
      6 datas = []
      7 for chunk in chunks(id_list):
----> 8     _data = {result["id"]:[tag["term"] for tag in result["tags"]] for result in arxiv.query(id_list=chunk)}
      9     datas.append(_data)
     10 data = dict(**datas)

//anaconda/envs/py35/lib/python3.5/site-packages/arxiv/arxiv.py in query(search_query, id_list, prune, start, max_results)
     23                                  "start": start,
     24                                  "max_results": max_results})
---> 25     results = feedparser.parse(root_url + 'query?' + url_args)
     26     if results.get('status') != 200:
     27         # TODO: better error reporting

//anaconda/envs/py35/lib/python3.5/site-packages/feedparser.py in parse(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers, response_headers)
   3839         handlers = [handlers]
   3840     try:
-> 3841         f = _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
   3842         data = f.read()
   3843     except Exception as e:

//anaconda/envs/py35/lib/python3.5/site-packages/feedparser.py in _open_resource(url_file_stream_or_string, etag, modified, agent, referrer, handlers, request_headers)
   2864         opener.addheaders = [] # RMK - must clear so we only send our custom User-Agent
   2865         try:
-> 2866             return opener.open(request)
   2867         finally:
   2868             opener.close() # JohnD

//anaconda/envs/py35/lib/python3.5/urllib/request.py in open(self, fullurl, data, timeout)
    464             req = meth(req)
    465 
--> 466         response = self._open(req, data)
    467 
    468         # post-process response

//anaconda/envs/py35/lib/python3.5/urllib/request.py in _open(self, req, data)
    482         protocol = req.type
    483         result = self._call_chain(self.handle_open, protocol, protocol +
--> 484                                   '_open', req)
    485         if result:
    486             return result

//anaconda/envs/py35/lib/python3.5/urllib/request.py in _call_chain(self, chain, kind, meth_name, *args)
    442         for handler in handlers:
    443             func = getattr(handler, meth_name)
--> 444             result = func(*args)
    445             if result is not None:
    446                 return result

//anaconda/envs/py35/lib/python3.5/urllib/request.py in http_open(self, req)
   1280 
   1281     def http_open(self, req):
-> 1282         return self.do_open(http.client.HTTPConnection, req)
   1283 
   1284     http_request = AbstractHTTPHandler.do_request_

//anaconda/envs/py35/lib/python3.5/urllib/request.py in do_open(self, http_class, req, **http_conn_args)
   1255             except OSError as err: # timeout error
   1256                 raise URLError(err)
-> 1257             r = h.getresponse()
   1258         except:
   1259             h.close()

//anaconda/envs/py35/lib/python3.5/http/client.py in getresponse(self)
   1195         try:
   1196             try:
-> 1197                 response.begin()
   1198             except ConnectionError:
   1199                 self.close()

//anaconda/envs/py35/lib/python3.5/http/client.py in begin(self)
    295         # read until we get a non-100 response
    296         while True:
--> 297             version, status, reason = self._read_status()
    298             if status != CONTINUE:
    299                 break

//anaconda/envs/py35/lib/python3.5/http/client.py in _read_status(self)
    256 
    257     def _read_status(self):
--> 258         line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
    259         if len(line) > _MAXLINE:
    260             raise LineTooLong("status line")

//anaconda/envs/py35/lib/python3.5/socket.py in readinto(self, b)
    573         while True:
    574             try:
--> 575                 return self._sock.recv_into(b)
    576             except timeout:
    577                 self._timeout_occurred = True

KeyboardInterrupt: 
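
Since the query above was interrupted, a more patient variant might look like the following (a sketch; it keeps the same arxiv.query call and the chunks/id_list defined above, pauses briefly between chunks, and merges incrementally so a partial run still leaves usable data):

import time

data = {}
for i, chunk in enumerate(chunks(id_list)):
    _data = {result["id"]: [tag["term"] for tag in result["tags"]]
             for result in arxiv.query(id_list=chunk)}
    data.update(_data)   # merge this chunk's results straight away
    time.sleep(3)        # brief pause between requests to the arXiv API
    print("Fetched", len(data), "records after", i + 1, "chunks")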

In [ ]:
import json
import os

# open() does not expand "~", so expand the home directory explicitly
with open(os.path.expanduser('~/Downloads/arxiv_tagterm.json'), 'w') as fp:
    json.dump(data, fp)

In [ ]: