In [212]:
import pandas as pd, numpy as np

In [2]:
url='https://en.wikipedia.org/wiki/List_of_International_Space_Station_expeditions'

In [321]:
df=pd.read_html(url)
df=pd.concat(df[:2]).reset_index()

In [322]:
def find_names(s,ppls,z):
    nms=s.split(' ')
    l=2
    while l<4:
        ppl=' '.join(nms[:l])
        if ppl in ppls:
            z.append(ppl)
            rest=' '.join(nms[l:])
            find_names(rest,ppls,z)
            l=4
        l+=1
    return z

In [323]:
dgs=[]
for i in df.index:
    crew=df.loc[i]['Crew'].replace('\n','')
    crews=find_names(crew,ppls,[])
    for c in crews:
        dg=df.loc[[i]][['Expedition','Duration(days)']].copy()
        dg['Crew']=c
        date=df.loc[i]['Launch date']
        date=date.split(' ')
        if ',' in date[1]:
            date=date[1].replace(',','')+' '+date[0]+' '+date[2][:4]
        else:
            date=date[0]+' '+date[1]+' '+date[2][:4]
        dg['Date']=date
        dgs.append(dg)

In [324]:
dgs=pd.concat(dgs).reset_index()
dhs=dgs.set_index(['Expedition','Crew'])

In [325]:
def get_duration(duration):
    default='160'
    duration=str(duration)
    if duration=='nan':
        return default
    if 'ransfer' in duration:
            duration=' '.join(duration.split(' ')[-2:])
            try:
                duration=dhs.loc[duration].loc[crew]['Duration(days)']
            except:
                print(crew,duration)
                return default
    return duration

In [326]:
data=[]
for i in dgs.index:
    crew=dgs.loc[i]['Crew']
    date=dgs.loc[i]['Date']
    if 'ransfer' not in date:
        duration=dgs.loc[i]['Duration(days)']
        duration=get_duration(duration)
        duration=get_duration(duration)
        duration=get_duration(duration)
        if '[' in duration:
            duration=duration[:duration.find('[')]
        duration=int(np.round(float(duration.replace('days','').replace('day','').strip()),0))
        data.append({'Crew':crew,'Date':date,'Duration':duration})


Mikhail Korniyenko year mission
Scott J. Kelly year mission
Timothy Peake Expedition 47
Aleksey Ovchinin Expedition 60
Christina Koch Expedition 60
Nick Hague Expedition 60

In [327]:
data=pd.DataFrame(data)

In [328]:
data.head()


Out[328]:
Crew Date Duration
0 William M. Shepherd 31 October 2000 141
1 Sergei Krikalev 31 October 2000 141
2 Yuri Gidzenko 31 October 2000 141
3 Yuri Usachev 8 March 2001 167
4 James S. Voss 8 March 2001 167

In [329]:
import bs4
import requests
r=requests.get(url)
soup = bs4.BeautifulSoup(r.content)
tables=soup.findAll("table")

In [330]:
links={}
country_map={}
for trs in [tables[0].findAll("tr"),tables[1].findAll("tr")]:
    for i,tr in enumerate(trs):
        if i>0:
            aas=tr.findAll("a")
            for j,a in enumerate(aas):
                if a:
                    txt=a.text
                    if txt:
                        if '[' not in txt:
                            links[txt]=a['href']
                            if txt in data['Crew'].values:
                                if txt not in country_map:
                                    country=aas[j-1].find('img')['alt']
                                    country_map[txt]=country
                    else:
                        if j>1:
                            country=a.find('img')['alt']
                            links[country]=a.find('img')['src']

In [331]:
countries=pd.DataFrame(country_map,index=['Country']).T
links=pd.DataFrame(links,index=['Link']).T

In [332]:
data=data.join(countries,on='Crew')
data['Crew_link']=data.join(links,on='Crew')['Link']
data['Country_link']=data.join(links,on='Country')['Link']

In [333]:
data.to_csv('data.csv')

In [342]:
ndata={}

In [343]:
for i in data.index:
    start=pd.to_datetime(data.loc[i]['Date'])
    periods=data.loc[i]['Duration']
    crew=data.loc[i]['Crew']
    country=data.loc[i]['Country']
    for idate in pd.date_range(start,periods=periods,freq='1D'):
        date=str(idate)[:10]
        if date not in ndata: ndata[date]={}
        if country not in ndata[date]: ndata[date][country]={'crew':[],'count':0}
        ndata[date][country]['crew'].append(crew)
        ndata[date][country]['count']+=1

In [ ]:


In [347]:
pd.DataFrame(ndata)


Out[347]:
2000-10-31 2000-11-01 2000-11-02 2000-11-03 2000-11-04 2000-11-05 2000-11-06 2000-11-07 2000-11-08 2000-11-09 ... 2019-08-11 2019-08-12 2019-08-13 2019-08-14 2019-08-15 2019-08-16 2019-08-17 2019-08-18 2019-08-19 2019-08-20
Belgium NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Canada NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
France NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Germany NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Italy NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Japan NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Netherlands NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
Russia {'crew': ['Sergei Krikalev', 'Yuri Gidzenko'],... {'crew': ['Sergei Krikalev', 'Yuri Gidzenko'],... {'crew': ['Sergei Krikalev', 'Yuri Gidzenko'],... {'crew': ['Sergei Krikalev', 'Yuri Gidzenko'],... {'crew': ['Sergei Krikalev', 'Yuri Gidzenko'],... {'crew': ['Sergei Krikalev', 'Yuri Gidzenko'],... {'crew': ['Sergei Krikalev', 'Yuri Gidzenko'],... {'crew': ['Sergei Krikalev', 'Yuri Gidzenko'],... {'crew': ['Sergei Krikalev', 'Yuri Gidzenko'],... {'crew': ['Sergei Krikalev', 'Yuri Gidzenko'],... ... {'crew': ['Aleksey Ovchinin'], 'count': 1} {'crew': ['Aleksey Ovchinin'], 'count': 1} {'crew': ['Aleksey Ovchinin'], 'count': 1} {'crew': ['Aleksey Ovchinin'], 'count': 1} {'crew': ['Aleksey Ovchinin'], 'count': 1} {'crew': ['Aleksey Ovchinin'], 'count': 1} {'crew': ['Aleksey Ovchinin'], 'count': 1} {'crew': ['Aleksey Ovchinin'], 'count': 1} {'crew': ['Aleksey Ovchinin'], 'count': 1} {'crew': ['Aleksey Ovchinin'], 'count': 1}
United Kingdom NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
United States {'crew': ['William M. Shepherd'], 'count': 1} {'crew': ['William M. Shepherd'], 'count': 1} {'crew': ['William M. Shepherd'], 'count': 1} {'crew': ['William M. Shepherd'], 'count': 1} {'crew': ['William M. Shepherd'], 'count': 1} {'crew': ['William M. Shepherd'], 'count': 1} {'crew': ['William M. Shepherd'], 'count': 1} {'crew': ['William M. Shepherd'], 'count': 1} {'crew': ['William M. Shepherd'], 'count': 1} {'crew': ['William M. Shepherd'], 'count': 1} ... {'crew': ['Christina Koch', 'Nick Hague'], 'co... {'crew': ['Christina Koch', 'Nick Hague'], 'co... {'crew': ['Christina Koch', 'Nick Hague'], 'co... {'crew': ['Christina Koch', 'Nick Hague'], 'co... {'crew': ['Christina Koch', 'Nick Hague'], 'co... {'crew': ['Christina Koch', 'Nick Hague'], 'co... {'crew': ['Christina Koch', 'Nick Hague'], 'co... {'crew': ['Christina Koch', 'Nick Hague'], 'co... {'crew': ['Christina Koch', 'Nick Hague'], 'co... {'crew': ['Christina Koch', 'Nick Hague'], 'co...

10 rows × 6868 columns


In [345]:
import json

In [346]:
open('ndata.json','w').write(json.dumps(ndata))


Out[346]:
1298702

In [ ]: