In [197]:
%matplotlib inline
import matplotlib.pyplot as plt
import requests
import BeautifulSoup as soup
import pandas as pd
import numpy as np
from scipy.stats import binom
In [3]:
# Wikipedia's "List of human evolution fossils" page is the data source.
url = 'https://en.wikipedia.org/wiki/List_of_human_evolution_fossils'
r = requests.get(url)
# Show the HTTP status code of the response as the cell output.
r.status_code
Out[3]:
In [4]:
# Parse the response body into a BeautifulSoup document tree.
s = soup.BeautifulSoup(r.text)
In [8]:
# Collect every <table> element found in the parsed page.
tables = s.findAll('table')
In [26]:
# Display the raw HTML of the table at index 1 to inspect its structure.
str(tables[1])
Out[26]:
In [99]:
# Build one DataFrame from the page tables at indices 1..6.
#
# Each table is parsed with pd.read_html; its first row holds the column
# titles, so that row is promoted to the header and dropped from the data.
#
# The frames are collected in a list and concatenated once. Every table is
# included exactly once: seeding the result with tables[1] and then looping
# from index 1 again would count the first table's rows twice.
table_frames = []
for i in range(1, 7):
    table_frame = pd.read_html(str(tables[i]))[0]
    table_frame.columns = table_frame.iloc[0]
    table_frames.append(table_frame[1:])
df = pd.concat(table_frames)
In [100]:
def clean_year(x):
    """Best-effort conversion of a 'Year discovered' value to an integer year.

    Recognised shapes:
      * "<anything>, YYYY" -> the text after the first ', ' is the year
      * "YYYY-YYYY"        -> the first year of the range
      * "YYYY" (or an int) -> the value itself

    Returns the year as an int. When the value cannot be parsed, the
    original value is returned unchanged instead of raising, so a single
    odd cell cannot abort the whole ``apply``.
    """
    try:
        text = str(x)
        comma = text.find(', ')
        if comma != -1:
            candidate = text[comma + 2:]
        else:
            dash = text.find('-')
            candidate = text[:dash] if dash != -1 else text
        return int(candidate)
    except ValueError:
        # Covers non-numeric text (and, on Python 2, non-ASCII unicode that
        # str() cannot encode): keep the raw value.
        return x
# Pass every 'Year discovered' value through clean_year.
df['Year discovered'] = df['Year discovered'].apply(clean_year)
# Constant helper column; the groupby cells below aggregate it with count()/sum().
df['c'] = 1
In [101]:
# Number of rows per distinct 'Year discovered' value.
df.groupby(['Year discovered'])['c'].count()
Out[101]:
In [146]:
# Boolean mask selecting the rows whose 'Species' is exactly 'Homo sapiens'.
# NOTE(review): `filter` shadows the Python builtin of the same name; the
# name is kept so that any cell referring to it keeps working.
filter = df['Species']=='Homo sapiens'
# .copy() gives `humans` its own data, so the column assignments made in
# later cells write to this frame and do not raise SettingWithCopyWarning.
humans = df[filter].copy()
In [213]:
# (rows, columns) of `humans`.
humans.shape
Out[213]:
In [147]:
# Number of rows in `humans` per distinct 'Country' value.
humans.groupby(['Country'])['c'].count()
Out[147]:
In [152]:
# Discard rows that have no 'Age' value.
humans = humans.dropna(subset=['Age'])
In [153]:
def age_clean(x):
    """Turn a textual 'Age' entry into a single negative number.

    Steps applied to ``str(x)``:
      1. everything from the first '[' onwards is discarded;
      2. every 'k' is removed and every '~' becomes '-';
      3. without a '-', the remaining text is parsed as a float;
         with a leading '-', the number after it is used;
         otherwise the first two '-'-separated numbers are averaged.

    The result is always multiplied by -1.
    """
    text = str(x)
    bracket_at = text.find('[')
    if bracket_at != -1:
        text = text[:bracket_at]
    text = text.replace('k', '').replace('~', '-')
    if '-' not in text:
        return -1 * float(text)
    parts = text.split('-')
    if parts[0] == '':
        # Leading '-' (originally '~' or '-'): a single value.
        return -1 * float(parts[1])
    # "low-high" range: use the midpoint.
    return -1 * (float(parts[0]) + float(parts[1])) / 2
# Replace each 'Age' entry with the negative number age_clean returns for it.
humans['Age'] = humans['Age'].apply(age_clean)
In [160]:
# Order the rows by 'Age'. DataFrame.sort() no longer exists in pandas;
# sort_values() is its replacement. Rebinding the name (instead of
# inplace=True) keeps the cell safe to re-run.
humans = humans.sort_values('Age')
# Bin edges -200, -175, ..., 0: each bin is 25 units wide.
bins = np.arange(-200, 10, 25)
# np.histogram returns (counts, bin_edges); shown as the cell output.
h = np.histogram(humans['Age'], bins=bins)
h
Out[160]:
In [173]:
# Bar chart of the histogram counts, one bar per bin.
x = np.arange(len(h[0]))
plt.figure(figsize=(15,5))
plt.bar(x, h[0])
# Label each bar with the left edge of its bin (the last edge closes the final bin).
plt.xticks(x, bins[:-1])
plt.xlabel('Age of fossil (in thousands of years)')
plt.ylabel('Fossils found')
# Hide the vertical grid lines.
plt.gca().xaxis.grid(False)
plt.show()
In [191]:
# consider scaling based on https://en.wikipedia.org/wiki/World_population
# Default 'Population' for every row: 15,000.
humans['Population'] = 0.015 * 1000000
# Rows with Age >= -10 get 4,000,000 ...
humans.loc[humans['Age'] >= -10, 'Population'] = 4 * 1000000
# ... and rows with Age >= -8 are then overwritten with 5,000,000
# (order matters: this assignment overrides the previous one for those rows).
humans.loc[humans['Age'] >= -8, 'Population'] = 5 * 1000000
# NOTE(review): this bare expression is not the last line of the cell, so it displays nothing.
humans[['Population', 'Age']]
# Sum of the counter column 'c' for each distinct 'Population' value.
fossils_by_pop = pd.DataFrame(humans.groupby(['Population'])['c'].sum())
In [192]:
# Display the per-population fossil totals.
fossils_by_pop
Out[192]:
In [193]:
Out[193]:
In [206]:
# Fossil rate = 32 fossils / (15,000 individuals * 180,000 years).
# (-10000 - -190000) evaluates to 180000.
# NOTE(review): 32.0 is hard-coded — presumably read off the fossils_by_pop
# output for the 15,000 population group; confirm and derive it from the data.
fossils_per_population_per_year_human = 32.0 / (15000 * (-10000 - -190000))
fossils_per_population_per_year_human
Out[206]:
In [221]:
# Whole number of trials: 100000 divided by 550, rounded down.
# Floor division is written explicitly so binom receives an integer trial
# count on both Python 2 and Python 3 (true division would give 181.81...,
# for which the binomial pmf is nan).
# NOTE(review): presumably 100,000 years split into 550-year periods — confirm.
events = 100000 // 550
print(events)
# Binomial model: `events` trials, each succeeding with probability .925.
rv = binom(events, .925)
# Probability of exactly 150 successes.
pr_mvp = rv.pmf(150)
print(pr_mvp)
In [222]:
# Binomial model with 180000 * 500 trials ((-10000 - -190000) == 180000),
# each with success probability fossils_per_population_per_year_human.
# NOTE(review): 500 is presumably a population size — confirm.
rv = binom((-10000 - -190000) * 500, fossils_per_population_per_year_human)
# Probability of zero successes across all trials.
pr_unseen = rv.pmf(0)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(pr_unseen)
In [225]:
# Product of pr_mvp, pr_unseen and three fixed factors.
# NOTE(review): .999999, .2 and .05 are not derived in the visible cells — document their source.
.999999*.2*pr_mvp*.05*pr_unseen
Out[225]:
In [ ]:
# Pr(x=0, n=189000 | p=fossils_per_population_per_year_human)
In [214]:
#Let's take median values in both cases and assume that
#750 individuals have a 92.5 % chance of surviving 550 years.
prob_survivial = .925
# Whole number of 550-year periods in the 180,000-year span
# ((-10000 - -190000) == 180000). Floor division is explicit so binom gets
# an integer trial count on both Python 2 and Python 3.
discrete_evaluations = (-10000 - -190000) // 550
rv = binom(discrete_evaluations, prob_survivial)
# Probability that every one of the periods is survived (k == n).
rv.pmf(discrete_evaluations)
Out[214]: