In [1]:
import pandas as pd, numpy as np

In [2]:
url='https://en.wikipedia.org/wiki/List_of_International_Space_Station_expeditions'

In [9]:
import bs4
import requests
r = requests.get(url)
soup = bs4.BeautifulSoup(r.content, 'html.parser')   # explicit parser avoids bs4's parser-guessing warning
tables = soup.find_all("table")
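
A quick inspection cell (hypothetical; the table count and headers depend on the current state of the Wikipedia page) to confirm the expedition tables were found:

In [ ]:
# how many tables were parsed, and the first table's header cells
len(tables), [th.text.strip() for th in tables[0].find_all("th")][:6]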

In [32]:
# Map each crew member's name to their Wikipedia link, taking only
# anchors found in the first <td> of each data row of the two expedition tables.
ppls = {}
for trs in [tables[0].find_all("tr"), tables[1].find_all("tr")]:
    for i, tr in enumerate(trs):
        if i == 0:
            continue                      # skip the header row
        for j, td in enumerate(tr.find_all("td")):
            if j != 0:
                continue                  # names are only taken from the row's first cell
            for a in td.find_all("a"):
                txt = a.text
                # ignore footnote anchors like "[1]" and already-seen names
                if txt and '[' not in txt and txt not in ppls:
                    ppls[txt] = a['href']

In [34]:
dfs = pd.read_html(url)                   # one DataFrame per parseable <table> on the page
df = pd.concat(dfs[:2]).reset_index()     # keep the first two tables (the expedition listings)
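
A hypothetical peek at the columns the rest of the notebook relies on (names reflect the live page):

In [ ]:
df[['Expedition', 'Crew', 'Launch date', 'Duration(days)']].head(2)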

In [35]:
def find_names(s, ppls, z):
    """Split a concatenated crew string into known names.

    Tries prefixes of two or three space-separated tokens; when a prefix
    matches a scraped name in ppls, it is appended to z and the remainder
    of the string is processed recursively."""
    nms = s.split(' ')
    l = 2
    while l < 4:
        ppl = ' '.join(nms[:l])
        if ppl in ppls:
            z.append(ppl)
            rest = ' '.join(nms[l:])
            find_names(rest, ppls, z)
            l = 4          # match found: stop trying longer prefixes
        l += 1
    return z
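
A small illustration of how find_names greedily peels two- or three-token names off the front of a crew string (the lookup dict here is a toy stand-in for the scraped ppls):

In [ ]:
# toy lookup standing in for ppls; the hrefs are irrelevant to the matching
sample_ppls = {'Yuri Malenchenko': '#', 'Edward T. Lu': '#'}
find_names('Yuri Malenchenko Edward T. Lu', sample_ppls, [])
# expected: ['Yuri Malenchenko', 'Edward T. Lu']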

In [36]:
dgs=[]
for i in df.index:
    crew = df.loc[i]['Crew'].replace('\n', '')
    # split the concatenated crew cell into individual known names
    crews = find_names(crew, ppls, [])
    for c in crews:
        dg = df.loc[[i]][['Expedition', 'Duration(days)']].copy()
        dg['Crew'] = c
        # normalise the launch date to 'Day Month Year'
        date = df.loc[i]['Launch date'].split(' ')
        if ',' in date[1]:
            # 'Month Day, Year' order: reorder the tokens
            date = date[1].replace(',', '') + ' ' + date[0] + ' ' + date[2][:4]
        else:
            # already 'Day Month Year'; [:4] trims footnote markers from the year
            date = date[0] + ' ' + date[1] + ' ' + date[2][:4]
        dg['Date'] = date
        dgs.append(dg)
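
The date handling above can be exercised on its own; a minimal sketch (the helper name and the footnoted sample string are mine, for illustration only):

In [ ]:
def normalise_launch_date(raw):
    # mirrors the branch above: reorder 'Month Day, Year' to 'Day Month Year';
    # otherwise keep the order and trim the year token to four characters
    parts = raw.split(' ')
    if ',' in parts[1]:
        return parts[1].replace(',', '') + ' ' + parts[0] + ' ' + parts[2][:4]
    return parts[0] + ' ' + parts[1] + ' ' + parts[2][:4]

normalise_launch_date('April 26, 2003'), normalise_launch_date('26 April 2003[1]')
# both give '26 April 2003'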

In [37]:
dgs=pd.concat(dgs).reset_index()
dhs=dgs.set_index(['Expedition','Crew'])

In [38]:
def get_duration(duration):
    """Resolve a 'Duration(days)' cell to a number-of-days string.

    Cells reading 'Transferred to/from <Expedition N>' are resolved by
    looking up that expedition for the current crew member in dhs
    (note: relies on the `crew` variable set in the calling loop)."""
    default = '160'                    # fallback when the duration cannot be resolved
    duration = str(duration)
    if duration == 'nan':
        return default
    if 'ransfer' in duration:
        # the last two words name the target, e.g. 'Expedition 47'
        duration = ' '.join(duration.split(' ')[-2:])
        try:
            duration = dhs.loc[duration].loc[crew]['Duration(days)']
        except KeyError:
            print(crew, duration)
            return default
    return duration
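
What the transfer lookup does, on a toy frame with the same (Expedition, Crew) index as dhs (the expedition, name and value here are made up for illustration):

In [ ]:
toy = pd.DataFrame([{'Expedition': 'Expedition 48',
                     'Crew': 'Jeffrey N. Williams',
                     'Duration(days)': '172'}]).set_index(['Expedition', 'Crew'])
toy.loc['Expedition 48'].loc['Jeffrey N. Williams']['Duration(days)']
# -> '172'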

In [39]:
data=[]
for i in dgs.index:
    crew = dgs.loc[i]['Crew']
    date = dgs.loc[i]['Date']
    if 'ransfer' not in date:           # skip rows whose launch date is itself a transfer note
        duration = dgs.loc[i]['Duration(days)']
        # apply get_duration three times to follow chains of transfers
        # (an expedition's entry may itself point at another expedition)
        duration = get_duration(duration)
        duration = get_duration(duration)
        duration = get_duration(duration)
        if '[' in duration:             # strip trailing footnote markers like '[12]'
            duration = duration[:duration.find('[')]
        duration = int(np.round(float(duration.replace('days', '').replace('day', '').strip()), 0))
        data.append({'Crew': crew, 'Date': date, 'Duration': duration})


Mikhail Korniyenko year mission
Scott J. Kelly year mission
Yuri Malenchenko Expedition 47
Timothy Peake Expedition 47
Aleksey Ovchinin Expedition 60
Christina Koch Expedition 60
Nick Hague Expedition 60

In [40]:
data=pd.DataFrame(data)

In [41]:
data.head()


Out[41]:
                  Crew           Date  Duration
0     Yuri Malenchenko  26 April 2003       185
1      Gennady Padalka  19 April 2004       186
2     Pavel Vinogradov  30 March 2006       183
3  Jeffrey N. Williams  30 March 2006       183
4        Thomas Reiter    4 July 2006       171

In [42]:
# Re-scan the two tables to collect:
#   links       - name -> Wikipedia href, and country -> flag-image src
#   country_map - crew member -> country, read from the flag image's alt text
#                 on the anchor immediately before the name link
links = {}
country_map = {}
for trs in [tables[0].find_all("tr"), tables[1].find_all("tr")]:
    for i, tr in enumerate(trs):
        if i == 0:
            continue                                  # skip the header row
        aas = tr.find_all("a")
        for j, a in enumerate(aas):
            txt = a.text
            if txt:
                if '[' not in txt:                    # ignore footnote anchors
                    links[txt] = a['href']
                    if txt in data['Crew'].values and txt not in country_map:
                        country = aas[j-1].find('img')['alt']
                        country_map[txt] = country
            elif j > 1:
                # image-only anchors are flags: map the country name to its flag image
                country = a.find('img')['alt']
                links[country] = a.find('img')['src']
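
A hypothetical spot-check of the two lookup dicts just built (actual entries depend on the live page):

In [ ]:
len(links), len(country_map), list(country_map.items())[:3]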

In [43]:
countries = pd.DataFrame(country_map, index=['Country']).T   # index: crew name, single column 'Country'
links = pd.DataFrame(links, index=['Link']).T                 # index: name or country, single column 'Link'

In [44]:
data=data.join(countries,on='Crew')
data['Crew_link']=data.join(links,on='Crew')['Link']
data['Country_link']=data.join(links,on='Country')['Link']
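
After the joins, each row carries the country plus the two URLs; a quick peek (hypothetical cell):

In [ ]:
data[['Crew', 'Country', 'Crew_link', 'Country_link']].head(2)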

In [45]:
data.to_csv('data.csv')

In [46]:
# Count, for every calendar day, how many crew members from each country
# were on an ISS expedition: ndata[date][country] -> head count.
ndata = {}
for i in data.index:
    start = pd.to_datetime(data.loc[i]['Date'])
    periods = data.loc[i]['Duration']
    crew = data.loc[i]['Crew']
    country = data.loc[i]['Country']
    for idate in pd.date_range(start, periods=periods, freq='1D'):
        date = str(idate)[:10]                        # ISO date, e.g. '2006-07-10'
        if date not in ndata: ndata[date] = {}
        if country not in ndata[date]: ndata[date][country] = 0
        ndata[date][country] += 1
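
ndata maps each ISO date to a per-country head count; a peek at the earliest day recorded:

In [ ]:
first_day = sorted(ndata)[0]
first_day, ndata[first_day]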

In [47]:
pd.DataFrame(ndata).to_csv('ndata.csv')

In [48]:
import json

In [49]:
open('ndata.json','w').write(json.dumps(ndata))


Out[49]:
288546

In [50]:
# Long-form version of the same data: one record per crew member per day.
ndata2 = []
for i in data.index:
    start = pd.to_datetime(data.loc[i]['Date'])
    periods = data.loc[i]['Duration']
    crew = data.loc[i]['Crew']
    country = data.loc[i]['Country']
    for idate in pd.date_range(start, periods=periods, freq='1D'):
        date = str(idate)[:10]
        ndata2.append({'Date': date, 'Name': crew, 'Country': country, 'Crew': 1})
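
The first couple of records show the shape of the long format (hypothetical inspection cell):

In [ ]:
ndata2[:2]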

In [51]:
open('ndata2.json','w').write(json.dumps(ndata2))


Out[51]:
1868353

In [52]:
open('ndata2a.json','w').write(json.dumps(ndata2[:1000]))


Out[52]:
86130

In [53]:
pd.DataFrame(ndata).T.to_csv('ndataT.csv')

In [54]:
# Rebuild df as a date x country matrix of daily head counts (this reuses the name df)
df = pd.DataFrame(ndata).T.fillna(0)
df.index.name = 'Date'
df = df.reset_index()

In [55]:
# Column-oriented export: head counts cast to int then str, dates left as-is
edata = {}
for c in df.columns:
    if c != 'Date':
        edata[c] = list(df[c].values.astype(int).astype(str))
    else:
        edata[c] = list(df[c].values)
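
Every key of edata maps to a plain list (counts as strings, dates untouched); a quick look at the first few values of two of the columns (hypothetical cell; 'Russia' is one of the countries listed in Out[56] below):

In [ ]:
{k: edata[k][:3] for k in ('Date', 'Russia')}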

In [56]:
edata.keys()


Out[56]:
dict_keys(['Date', 'Belgium', 'Canada', 'France', 'Germany', 'Italy', 'Japan', 'Netherlands', 'Russia', 'United Kingdom', 'United States'])

In [57]:
json.dump(edata,open('edata.json','w'))

In [ ]: