In [197]:
%matplotlib inline
import matplotlib.pyplot as plt
import requests
import BeautifulSoup as soup
import pandas as pd
import numpy as np
from scipy.stats import binom

In [3]:
url = 'https://en.wikipedia.org/wiki/List_of_human_evolution_fossils'
r = requests.get(url)
r.status_code


Out[3]:
200

In [4]:
s = soup.BeautifulSoup(r.text)

In [8]:
tables = s.findAll('table')

In [26]:
str(tables[1])


Out[26]:
'<table class="wikitable sortable">\n<tr style="background:#efefef;">\n<th></th>\n<th>Name</th>\n<th>Age</th>\n<th>Species</th>\n<th>Year<br />\ndiscovered</th>\n<th>Country</th>\n<th>Discovered by</th>\n<th>Now located at</th>\n</tr>\n<tr>\n<td><a href="/wiki/File:Sahelanthropus_tchadensis_-_TM_266-01-060-1.jpg" class="image"><img alt="Sahelanthropus tchadensis - TM 266-01-060-1.jpg" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fc/Sahelanthropus_tchadensis_-_TM_266-01-060-1.jpg/100px-Sahelanthropus_tchadensis_-_TM_266-01-060-1.jpg" width="100" height="84" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/fc/Sahelanthropus_tchadensis_-_TM_266-01-060-1.jpg/150px-Sahelanthropus_tchadensis_-_TM_266-01-060-1.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/f/fc/Sahelanthropus_tchadensis_-_TM_266-01-060-1.jpg/200px-Sahelanthropus_tchadensis_-_TM_266-01-060-1.jpg 2x" data-file-width="4239" data-file-height="3571" /></a></td>\n<td><a href="/wiki/Sahelanthropus" title="Sahelanthropus">TM 266 (Toumai)</a></td>\n<td>7 Ma<sup id="cite_ref-2" class="reference"><a href="#cite_note-2"><span>[</span>2<span>]</span></a></sup></td>\n<td><i><a href="/wiki/Sahelanthropus" title="Sahelanthropus">Sahelanthropus tchadensis</a></i></td>\n<td>2001</td>\n<td><a href="/wiki/Chad" title="Chad">Chad</a></td>\n<td><a href="/wiki/Michel_Brunet_(paleontologist)" title="Michel Brunet (paleontologist)">Michel Brunet</a> Alain Beauvilain, Fanone Gongdibe, Mahamat Adoum and Ahounta Djimdoumalbaye</td>\n<td></td>\n</tr>\n<tr>\n<td><a href="/wiki/File:BAR_1002_ororin_Credit_Chip_Clark,_Smithsonian_Institution.JPG" class="image"><img alt="BAR 1002 ororin Credit Chip Clark, Smithsonian Institution.JPG" src="//upload.wikimedia.org/wikipedia/commons/thumb/6/6d/BAR_1002_ororin_Credit_Chip_Clark%2C_Smithsonian_Institution.JPG/100px-BAR_1002_ororin_Credit_Chip_Clark%2C_Smithsonian_Institution.JPG" width="100" height="151" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/6/6d/BAR_1002_ororin_Credit_Chip_Clark%2C_Smithsonian_Institution.JPG/150px-BAR_1002_ororin_Credit_Chip_Clark%2C_Smithsonian_Institution.JPG 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/6/6d/BAR_1002_ororin_Credit_Chip_Clark%2C_Smithsonian_Institution.JPG/200px-BAR_1002_ororin_Credit_Chip_Clark%2C_Smithsonian_Institution.JPG 2x" data-file-width="343" data-file-height="518" /></a></td>\n<td><a href="/wiki/Orrorin" title="Orrorin">BAR 1000\'00</a></td>\n<td>6 Ma<sup id="cite_ref-3" class="reference"><a href="#cite_note-3"><span>[</span>3<span>]</span></a></sup></td>\n<td><i><a href="/wiki/Orrorin" title="Orrorin">Orrorin tugenensis</a></i></td>\n<td>2000</td>\n<td><a href="/wiki/Kenya" title="Kenya">Kenya</a></td>\n<td><a href="/wiki/Martin_Pickford" title="Martin Pickford">Martin Pickford</a>, Kiptalam Cheboi, Dominique Gommery, Pierre Mein, Brigitte Senut,</td>\n<td></td>\n</tr>\n<tr>\n<td></td>\n<td>ALA-VP 1/20<sup id="cite_ref-4" class="reference"><a href="#cite_note-4"><span>[</span>4<span>]</span></a></sup></td>\n<td>5.5-5.8 Ma</td>\n<td><i><a href="/wiki/Ardipithecus#Ardipithecus_kadabba" title="Ardipithecus">Ardipithecus kadabba</a></i></td>\n<td>1997</td>\n<td><a href="/wiki/Ethiopia" title="Ethiopia">Ethiopia</a></td>\n<td><a href="/wiki/Yohannes_Haile-Selassie" title="Yohannes Haile-Selassie">Yohannes Haile-Selassie</a></td>\n<td></td>\n</tr>\n</table>'

In [99]:
dfs = pd.read_html(str(tables[1]))
df = dfs[0]
df.columns = df.iloc[0]
df = df[1:df.shape[0]]
for i in range(1, 7):
    dfs = pd.read_html(str(tables[i]))
    df2 = dfs[0]
    df2.columns = df2.iloc[0]
    df2 = df2[1:df2.shape[0]]
    df = df.append(df2)

In [100]:
def clean_year(x):
    """A fairly sloppy way of cleaning up this data"""
    return x
    x = str(x)
    i = x.find(', ')
    if i != -1:
        return int(x[i+2:len(x)])
    i = x.find('-')
    if i != -1:
        return int(x[0:i])
    return int(x)

df['Year discovered'] = df['Year discovered'].apply(clean_year)
df['c'] = 1

In [101]:
df.groupby(['Year discovered'])['c'].count()


Out[101]:
Year discovered
1899         1
1903         1
1908         1
1909         2
1914         1
1920         1
1921         1
1929         2
1930         1
1931         2
1933         3
1938         1
1947         1
1960         2
1961         1
1963         1
1965         1
1965-1975    1
1967         3
1968         1
1973         1
1974         2
1975         1
1976         1
1978         1
1981         1
1982         1
1983         1
1984         1
1991         4
1992         1
1993         1
1994         2
1995         1
1997         3
1999         1
2000         3
2001         2
2003         1
2005         1
2013         1
2015         1
Name: c, dtype: int64

In [146]:
filter = df['Species']=='Homo sapiens'
humans = df[filter]

In [213]:
humans.shape


Out[213]:
(41, 15)

In [147]:
humans.groupby(['Country'])['c'].count()


Out[147]:
Country
Algeria                     1
Australia                   4
Brazil                      1
Chile                       1
Czech Republic              1
Egypt                       1
Ethiopia                    2
France                      3
Indonesia                   1
Israel                      4
Japan                       2
Mexico                      1
Minnesota, United States    1
Morocco                     4
Nigeria                     1
Romania                     1
South Africa                2
Sri Lanka                   1
Sudan                       1
UK                          5
United States               3
Name: c, dtype: int64

In [152]:
humans = humans.dropna(subset=['Age'])

In [153]:
def age_clean(x):
    x = str(x)
    i = x.find('[')
    if i != -1:
        x = x[0:i]
    x = x.replace('k', '')
    x = x.replace('~', '-')
    i = x.find('-')
    if i >= 0:
        arr = x.split('-')
        if arr[0]=='':
            return -1 * float(arr[1])
        return -1 * (float(arr[0]) + float(arr[1]))/2
    return -1 * float(x)

humans['Age'] = humans['Age'].apply(age_clean)

In [160]:
humans.sort('Age', inplace=True)
humans['Age']
bins = np.arange(-200, 10, 25)
h = np.histogram(humans['Age'], bins=bins)
h


Out[160]:
(array([ 2,  4,  0,  0,  5,  0,  9, 21]),
 array([-200, -175, -150, -125, -100,  -75,  -50,  -25,    0]))

In [173]:
x = np.arange(len(h[0]))
plt.figure(figsize=(15,5))
plt.bar(x, h[0])
plt.xticks(x, bins[0:len(bins)-1])
plt.xlabel('Age of fossil (in thosands of years)')
plt.ylabel('Fossils found')
plt.gca().xaxis.grid(False)
plt.show()



In [191]:
# consider scaling based on https://en.wikipedia.org/wiki/World_population
humans['Population'] = 0.015 * 1000000
humans.loc[humans['Age'] >= -10, 'Population'] = 4 * 1000000
humans.loc[humans['Age'] >= -8, 'Population'] = 5 * 1000000
humans[['Population', 'Age']]
fossils_by_pop = pd.DataFrame(humans.groupby(['Population'])['c'].sum())

In [192]:
fossils_by_pop


Out[192]:
c
Population
15000 32
4000000 6
5000000 3

In [193]:



Out[193]:
nan nan nan Age Country Date discovered Discovered by Image Name Now Located at Now located at Species Year discovered c Population
7 [56] [56] NaN -190.00 Ethiopia NaN Richard Leakey NaN Omo 2 NaN NaN Homo sapiens NaN 1 15000
6 [55] [55] NaN -190.00 Ethiopia NaN Richard Leakey NaN Omo 1 NaN NaN Homo sapiens 1967 1 15000
12 NaN NaN NaN -160.00 Morocco NaN NaN NaN Jebel Irhoud 4 NaN NaN Homo sapiens 1991 1 15000
11 [59] [59] NaN -160.00 Morocco NaN NaN NaN Jebel Irhoud 3 NaN NaN Homo sapiens 1991 1 15000
9 [58] [58] NaN -160.00 Morocco NaN NaN NaN Jebel Irhoud 1 NaN NaN Homo sapiens 1991 1 15000
10 NaN NaN NaN -160.00 Morocco NaN NaN NaN Jebel Irhoud 2 NaN NaN Homo sapiens 1991 1 15000
22 NaN NaN NaN -100.00 Israel NaN NaN NaN Skhul IX NaN NaN Homo sapiens NaN 1 15000
21 NaN NaN NaN -100.00 Israel NaN T. McCown and H. Moivus, Jr. NaN Skhul V NaN NaN Homo sapiens 1933 1 15000
23 [70] [70] NaN -100.00 South Africa NaN Ray Inskeep, Robin Singer, John Wymer, Hilary ... NaN Klasies River Caves NaN NaN Homo sapiens 1960 1 15000
17 NaN NaN NaN -95.00 Israel NaN R. Neuville M Stekelis NaN Qafzeh 6[67] NaN NaN Homo sapiens 1930 1 15000
19 [69] [69] NaN -95.00 Israel NaN T. McCown and H. Moivus, Jr. NaN Qafzeh VI NaN NaN Homo sapiens 1933 1 15000
6 NaN NaN NaN -50.00 Australia 1974 NaN NaN Mungo Man NaN NaN Homo sapiens NaN 1 15000
4 NaN NaN NaN -43.00 UK 1927 NaN NaN Kents Cavern NaN NaN Homo sapiens NaN 1 15000
12 NaN NaN NaN -37.00 Sri Lanka 2012 NaN NaN Balangoda man NaN NaN Homo sapiens NaN 1 15000
13 NaN NaN NaN -36.00 South Africa 1952 NaN NaN Hofmeyr Skull NaN NaN Homo sapiens NaN 1 15000
14 [79] [79] NaN -36.00 Romania 2002 NaN NaN Peștera cu Oase NaN NaN Homo sapiens NaN 1 15000
15 NaN NaN NaN -33.00 UK 1823 William Buckland NaN Red Lady of Paviland NaN NaN Homo sapiens NaN 1 15000
16 NaN NaN NaN -32.00 Japan 1962 NaN NaN Yamashita-Cho Man NaN NaN Homo sapiens NaN 1 15000
21 NaN NaN NaN -30.00 France 1868 Louis Lartet NaN Cro-Magnon 1 NaN NaN Homo sapiens NaN 1 15000
23 NaN NaN NaN -26.00 Czech Republic 1894 K.J. Maska NaN Predmost 3[81] NaN NaN Homo sapiens NaN 1 15000
25 [82] [82] NaN -24.00 UK 1997 NaN NaN Eel Point NaN NaN Homo sapiens NaN 1 15000
28 NaN NaN NaN -17.00 Australia 1967 Duncan Merrilees NaN Tandou[84][85] NaN NaN Homo sapiens NaN 1 15000
27 [83] [83] NaN -17.00 Japan 1970 NaN NaN Minatogawa 1 Anthropology Museum, Tokyo University NaN Homo sapiens NaN 1 15000
29 NaN NaN NaN -14.70 UK 2010 NaN NaN Gough's Cave[86][87] NaN NaN Homo sapiens NaN 1 15000
32 NaN NaN NaN -14.50 France 1888 NaN NaN Chancelade find NaN NaN Homo sapiens NaN 1 15000
7 NaN NaN NaN -14.00 Egypt NaN Fred Wendorf NaN Wadi Kubbaniya[citation needed] NaN NaN Homo sapiens 1982 1 15000
30 NaN NaN NaN -13.00 Nigeria 1965 NaN NaN Iwo Eleru Skull NaN NaN Homo sapiens NaN 1 15000
31 NaN NaN NaN -13.00 United States 1959 Phil Orr NaN Arlington Springs Man NaN NaN Homo sapiens NaN 1 15000
33 NaN NaN NaN -11.50 Brazil 1975 Annette Laming-Emperaire NaN Luzia NaN NaN Homo sapiens NaN 1 15000
34 [92] [92] NaN -11.00 Chile 1936 Junius Bird NaN Cerro Sota 2[93] NaN NaN Homo sapiens NaN 1 15000
4 NaN NaN NaN -11.00 Australia NaN A.G. Thorne NaN Kow Swamp 1 NaN NaN Homo sapiens 1968 1 15000
35 NaN NaN NaN -11.00 Indonesia 1888 B.D. van Rietschoten NaN Wadjak 1[94] NaN NaN Homo sapiens NaN 1 15000
5 NaN NaN NaN -10.00 Algeria NaN C. Arambourg NaN Afalou 13[citation needed] NaN NaN Homo sapiens 1920 1 4000000
6 NaN NaN NaN -10.00 Sudan NaN G. Armelagos, E. Ewing, D. Greene NaN Wadi Halfa 25[citation needed] NaN NaN Homo sapiens 1963 1 4000000
1 NaN NaN NaN -10.00 United States NaN NaN NaN La Brea Woman NaN NaN Homo sapiens 1914 1 4000000
2 NaN NaN NaN -9.60 France NaN O. Hauser NaN Combe Capelle NaN NaN Homo sapiens 1909 1 4000000
3 NaN NaN NaN -9.00 UK NaN NaN NaN Cheddar Man NaN NaN Homo sapiens 1903 1 4000000
11 NaN NaN NaN -8.35 United States NaN M.J. Rogers NaN SDM 16704[citation needed] NaN NaN Homo sapiens 1929 1 4000000
10 NaN NaN NaN -8.00 Mexico NaN H. de Terra NaN Tepexpan man NaN NaN Homo sapiens 1947 1 5000000
8 NaN NaN NaN -7.90 Minnesota, United States NaN Albert Jenks, via construction crew NaN Minnesota Woman NaN NaN Homo sapiens 1931 1 5000000
9 NaN NaN NaN -7.50 NaN NaN H. Robbins B.M. Lynch NaN Lo 4b[96][97] NaN NaN Homo sapiens 1965-1975 1 5000000

In [206]:
fossils_per_population_per_year_human = 32.0 / (15000 * (-10000 - -190000))
fossils_per_population_per_year_human


Out[206]:
1.1851851851851851e-08

In [221]:
events = 100000/550
print events
rv = binom(events, .925)
pr_mvp = rv.pmf(150)
print pr_mvp


181
8.64756621207e-06

In [222]:
rv = binom((-10000 - -190000) * 500, fossils_per_population_per_year_human)
pr_unseen = rv.pmf(0)
print pr_unseen


0.34415378469

In [225]:
.999999*.2*pr_mvp*.05*pr_unseen


Out[225]:
2.9760896641502296e-08

In [ ]:
Pr(x=0, n=189000 | p=fossils_per_population_per_year_human)

In [214]:
#Let's take median values in both cases and assume that 
#750 individuals have a 92.5 % chance of surviving 550 years.

prob_survivial = .925

discrete_evaluations = (-10000 - -190000) / 550
discrete_evaluations

rv = binom(discrete_evaluations, prob_survivial)
rv.pmf(discrete_evaluations)


Out[214]:
8.4790384694028736e-12