Make the date more granular instead (using month rather than year).

Calculate "time to read" for a single human (average words / [16 words/s]) vs. a single processor (average chars / CPU).


In [1]:
%matplotlib inline
import re
from io import StringIO

import matplotlib
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt

# Use a larger default font size for every figure drawn below.
font = dict(size=15)
matplotlib.rc('font', **font)

In [2]:
# Download the Wikipedia "Microprocessor chronology" page and parse its HTML
# so the tables can be extracted in the next cell.
r = requests.get(
    "https://en.wikipedia.org/wiki/Microprocessor_chronology",
    timeout=30,  # do not hang the notebook forever on a stalled connection
)
# Stop here on an HTTP error rather than silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, "lxml")

In [3]:
# Parse every <table> element on the page into its own DataFrame, give the
# clock column a short common name, and stack the results into one frame.
dfs = []
for table in soup.find_all("table"):
    # Wrap the markup in StringIO: recent pandas deprecates passing literal
    # HTML strings to read_html, while file-like input works everywhere.
    _df = pd.read_html(StringIO(str(table)), header=0)[0]
    dfs.append(_df.rename(columns={"Max clock (first version)": "Clock"}))

# ignore_index avoids the duplicated row labels that stacking would leave.
df = pd.concat(dfs, ignore_index=True)

# Keep only the rows that have both a clock value and a date.
df = df.dropna(subset=["Clock", "Date"])

In [4]:
# Multiplier that converts a value in the given unit to Hz.
unit_map = {"GHz":1e9,"MHz":1e6,"kHz":1e3}

# Matches exactly the units listed in unit_map, so a unit written directly
# after the number ("5MHz") is still recognised.
_unit_pattern = re.compile("|".join(re.escape(name) for name in unit_map))

def get_unit(unit):
    """Return the Hz multiplier for the first known unit found in ``unit``.

    Parameters
    ----------
    unit : object
        Clock description such as ``"740 kHz"`` or ``"5MHz"``. It is converted
        with ``str()`` first, so non-string cells (e.g. NaN) are accepted.

    Returns
    -------
    float or None
        The matching value from ``unit_map``, or None when the text contains
        none of the units listed there.
    """
    found = _unit_pattern.search(str(unit))
    if found is None:
        return None
    return unit_map[found.group(0)]

def get_mag(unit):
    """Return the first number found in ``unit`` as a float.

    The number may have a decimal part ("4.77 MHz" -> 4.77); a plain integer
    such as a year ("1971") is returned as 1971.0.

    Parameters
    ----------
    unit : object
        Value to search; it is converted with ``str()`` first.

    Returns
    -------
    float or None
        The parsed number, or None when the text contains no digits.
    """
    # Optional fractional part so sub-unit and fractional clocks are not
    # truncated to their integer part.
    found = re.search(r'\d+(?:\.\d+)?', str(unit))
    if found is None:
        return None
    return float(found.group(0))

# Work on a copy so the column assignments below never write into a slice of
# the frame produced by the previous cell.
df = df.copy()

# Split the "Clock" text into a magnitude and a unit multiplier, and take the
# year from "Date". astype(float) turns unparseable entries (None) into NaN.
df["cpu_unit"] = df["Clock"].apply(get_unit).astype(float)
df["cpu"] = df["Clock"].apply(get_mag).astype(float)
df["cpu_norm"] = df["cpu"] * df["cpu_unit"]  # magnitude * unit multiplier
df["year"] = df["Date"].apply(get_mag).astype(float)

# Keep only rows with a positive clock, a recognised unit and a year. Doing
# this before the gap filling means a year whose rows are all unusable is
# treated as missing and filled too.
usable = (df["cpu"] > 0) & df["cpu_norm"].notnull() & df["year"].notnull()
df = df.loc[usable].reset_index(drop=True)

# Fill in any missing years with the best clock seen up to that year.
# Rows are collected first and added with a single concat: DataFrame.append
# no longer exists in pandas 2.x and was quadratic when used in a loop.
observed_years = set(df["year"])
filler_rows = []
for yr in range(int(df["year"].min()), int(df["year"].max()) + 1):
    if yr in observed_years:
        continue
    print(yr)
    best_so_far = df.loc[df["year"] <= yr, "cpu_norm"].max()
    filler_rows.append(dict(year=yr, cpu_norm=best_so_far))
if filler_rows:
    df = pd.concat([df, pd.DataFrame(filler_rows)], ignore_index=True)

# cpu_max = highest cpu_norm over all rows whose year is <= the row's year.
# groupby sorts the years, so a cumulative max over the per-year maxima gives
# the same value as rescanning the whole frame for every row.
best_by_year = df.groupby("year")["cpu_norm"].max().cummax()
df["cpu_max"] = df["year"].map(best_by_year)

ax = df.plot.scatter(y="cpu_max",x="year")


1980
2016

In [5]:
# ax = df.plot.scatter(y="cpu_norm",x="year")
# ax.set_ylim(0,10)

In [6]:
def get_year(date):
    """Return the first run of digits in ``date`` as a string, or None.

    ``date`` must be a string; for "1991-07" the result is "1991".
    """
    first_digits = re.search(r'\d+', date)
    return first_digits.group(0) if first_digits else None

# Load arXiv's monthly submission statistics, then derive a "year" column
# (leading digits of "month", kept as text) and a "date" column (the month
# parsed as a timestamp).
arxiv_stats_url = "https://arxiv.org/stats/get_monthly_submissions"
df_arxiv = pd.read_csv(arxiv_stats_url)
month_text = df_arxiv["month"]
df_arxiv["year"] = month_text.map(get_year)
df_arxiv['date'] = pd.to_datetime(month_text, format='%Y-%m')
df_arxiv


Out[6]:
month submissions historical_delta year date
0 1991-07 0 -2 1991 1991-07-01
1 1991-08 27 -1 1991 1991-08-01
2 1991-09 58 0 1991 1991-09-01
3 1991-10 76 0 1991 1991-10-01
4 1991-11 64 0 1991 1991-11-01
5 1991-12 78 0 1991 1991-12-01
6 1992-01 88 -105 1992 1992-01-01
7 1992-02 124 -10 1992 1992-02-01
8 1992-03 117 -3 1992 1992-03-01
9 1992-04 184 -41 1992 1992-04-01
10 1992-05 226 -11 1992 1992-05-01
11 1992-06 232 -5 1992 1992-06-01
12 1992-07 274 -22 1992 1992-07-01
13 1992-08 223 -4 1992 1992-08-01
14 1992-09 315 -21 1992 1992-09-01
15 1992-10 372 -29 1992 1992-10-01
16 1992-11 437 -16 1992 1992-11-01
17 1992-12 394 -10 1992 1992-12-01
18 1993-01 364 -36 1993 1993-01-01
19 1993-02 412 -16 1993 1993-02-01
20 1993-03 495 -9 1993 1993-03-01
21 1993-04 488 -17 1993 1993-04-01
22 1993-05 531 -7 1993 1993-05-01
23 1993-06 520 -16 1993 1993-06-01
24 1993-07 608 -35 1993 1993-07-01
25 1993-08 516 -25 1993 1993-08-01
26 1993-09 502 -16 1993 1993-09-01
27 1993-10 642 -36 1993 1993-10-01
28 1993-11 698 -16 1993 1993-11-01
29 1993-12 723 -15 1993 1993-12-01
... ... ... ... ... ...
286 2015-05 8172 0 2015 2015-05-01
287 2015-06 9217 0 2015 2015-06-01
288 2015-07 8995 0 2015 2015-07-01
289 2015-08 7983 0 2015 2015-08-01
290 2015-09 9318 0 2015 2015-09-01
291 2015-10 9223 0 2015 2015-10-01
292 2015-11 9472 0 2015 2015-11-01
293 2015-12 9376 0 2015 2015-12-01
294 2016-01 8251 0 2016 2016-01-01
295 2016-02 9142 0 2016 2016-02-01
296 2016-03 9746 0 2016 2016-03-01
297 2016-04 8948 0 2016 2016-04-01
298 2016-05 9792 0 2016 2016-05-01
299 2016-06 9644 0 2016 2016-06-01
300 2016-07 8911 0 2016 2016-07-01
301 2016-08 9016 0 2016 2016-08-01
302 2016-09 9869 0 2016 2016-09-01
303 2016-10 10100 0 2016 2016-10-01
304 2016-11 10362 0 2016 2016-11-01
305 2016-12 9599 0 2016 2016-12-01
306 2017-01 9186 0 2017 2017-01-01
307 2017-02 8910 0 2017 2017-02-01
308 2017-03 11008 0 2017 2017-03-01
309 2017-04 9029 0 2017 2017-04-01
310 2017-05 11194 0 2017 2017-05-01
311 2017-06 10297 0 2017 2017-06-01
312 2017-07 9980 0 2017 2017-07-01
313 2017-08 9854 0 2017 2017-08-01
314 2017-09 10517 0 2017 2017-09-01
315 2017-10 2518 0 2017 2017-10-01

316 rows × 5 columns


In [12]:
# Line plot of the number of submissions recorded for each month.
fig, ax = plt.subplots(figsize=(12, 6))
df_arxiv.plot.line(x="date", y="submissions", legend=False, ax=ax)
ax.set_xlabel("Date")
ax.set_ylabel("Submissions to arXiv")


Out[12]:
<matplotlib.text.Text at 0x112e367b8>

In [17]:
# Assumptions used by the "time to read" estimates in the cells below.
cpu_efficiency = 0.1      # fraction; the plot legend shows it as a percentage
words_in_text = 3000      # multiplied by the submission count in the estimates
chars_per_word = 10
human_read_speed = 10     # the plot legend reports this as words per second
day = 24 * 60 * 60        # seconds in a day; results are divided by this

def find_cpu(yr):
    """Look up the ``cpu_max`` value stored in ``df`` for the year ``yr``.

    Parameters
    ----------
    yr : str, int, float or None
        Year to look up; it is converted with ``int()``.

    Returns
    -------
    The first matching ``cpu_max`` value, or None when ``yr`` cannot be
    converted to an integer or ``df`` has no row for that year.
    """
    try:
        year = int(yr)
    except (TypeError, ValueError):
        # get_year returns None for text without digits; treat that as
        # "no CPU data" instead of raising in the middle of an apply().
        return None
    cpu = df.loc[df.year == year,"cpu_max"].values
    if len(cpu) > 0:
        return cpu[0]
    return None

# Attach the cpu_max value of each row's year.
df_arxiv["cpu"] = df_arxiv["year"].apply(find_cpu)

# Running total of submissions up to and including each date. Summing per
# date first (groupby sorts the dates) and taking a cumulative sum gives the
# same numbers as re-summing the whole frame for every row, in a single pass.
subs_per_date = df_arxiv.groupby("date")["submissions"].sum()
df_arxiv["total_subs"] = df_arxiv["date"].map(subs_per_date.cumsum()).astype(float)

fig,ax = plt.subplots(figsize=(12,6))
df_arxiv.plot(x="date",y="total_subs",legend=False,ax=ax)
ax.set_ylabel("Total submissions on arXiv")
ax.set_xlabel("Date")


Out[17]:
<matplotlib.text.Text at 0x1158a2080>

In [21]:
# Amount of text on arXiv at each date, counted in words and in characters.
total_words = words_in_text * df_arxiv["total_subs"]
total_chars = chars_per_word * total_words

# The human is modelled as reading human_read_speed words per second, and the
# CPU as handling one character per clock cycle at cpu_efficiency of its clock
# rate, which is what the plot legends state. chars_per_word therefore belongs
# in the CPU estimate, not the human one. Both results are in days.
df_arxiv["time_to_read_human"] = (total_words / human_read_speed) / day
df_arxiv["time_to_read_cpu"] = (total_chars / (df_arxiv["cpu"] * cpu_efficiency)) / day


def plot_time_to_read(log_scale=False):
    """Plot the human and CPU reading-time estimates against date.

    Parameters
    ----------
    log_scale : bool
        When True the y axis is logarithmic.

    Returns
    -------
    (fig, ax) : the matplotlib figure and axes that were drawn on.
    """
    fig, ax = plt.subplots(figsize=(12,6))
    df_arxiv.plot(x="date",y="time_to_read_human",label="A very fast human (%d words per second!)" % human_read_speed,ax=ax)
    df_arxiv.plot(x="date",y="time_to_read_cpu",label="Best CPU, reading characters at %d%% efficiency" % (100*cpu_efficiency),ax=ax)
    if log_scale:
        ax.semilogy()
    ax.set_ylabel("Time to read all\narticles on arXiv [days]")
    ax.set_xlabel("Date")
    return fig, ax


# Same data twice: linear y axis first, then logarithmic.
fig,ax = plot_time_to_read()
fig,ax = plot_time_to_read(log_scale=True)


Out[21]:
<matplotlib.text.Text at 0x11612ac18>

In [20]:
# Human reading time divided by CPU reading time, scaled by 1e-6 so the axis
# reads in millions.
human_days = df_arxiv["time_to_read_human"]
cpu_days = df_arxiv["time_to_read_cpu"]
df_arxiv["cpu_over_human"] = (1e-6 * human_days) / cpu_days

fig, ax = plt.subplots(figsize=(12, 6))
df_arxiv.plot(x="date", y="cpu_over_human", ax=ax, legend=False)
#ax.semilogy()
ax.set_ylabel("How much slower a human is than CPU\nat reading all of arXiv [factors of millions]")
ax.set_xlabel("Date")


Out[20]:
<matplotlib.text.Text at 0x115b217b8>

In [ ]: