In [212]:
import pandas as pd, numpy as np
In [2]:
url='https://en.wikipedia.org/wiki/List_of_International_Space_Station_expeditions'
In [321]:
df=pd.read_html(url)                   # every <table> on the page, as a list of DataFrames
df=pd.concat(df[:2]).reset_index()     # keep the first two tables (the expedition listings)
In [322]:
def find_names(s,ppls,z):
    # Recursively peel known crew names off the front of s.
    # Names are assumed to be two or three words long and to appear in ppls.
    nms=s.split(' ')
    l=2
    while l<4:
        ppl=' '.join(nms[:l])
        if ppl in ppls:
            z.append(ppl)
            rest=' '.join(nms[l:])
            find_names(rest,ppls,z)   # parse the rest of the string
            l=4                       # stop trying longer prefixes at this position
        l+=1
    return z
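Note: `ppls` is used in the next cell but never defined in this notebook; presumably it is a pre-built collection of full crew names. A minimal sketch of one way it could be assembled from the link text in the same two tables (an assumption, not necessarily how it was built originally):
In [ ]:
import bs4, requests
# Assumption: ppls is the set of full crew names; here it is filled with the
# two- or three-word link texts found in the first two tables on the page.
_soup=bs4.BeautifulSoup(requests.get(url).content,'html.parser')
ppls=set()
for _table in _soup.find_all('table')[:2]:
    for _a in _table.find_all('a'):
        _txt=_a.text.strip()
        if _txt and '[' not in _txt and 2<=len(_txt.split(' '))<=3:
            ppls.add(_txt)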
In [323]:
dgs=[]
for i in df.index:
    crew=df.loc[i]['Crew'].replace('\n','')
    crews=find_names(crew,ppls,[])        # ppls: known crew names (see the sketch above)
    for c in crews:
        dg=df.loc[[i]][['Expedition','Duration(days)']].copy()
        dg['Crew']=c
        # Normalise the launch date to 'DD Month YYYY'
        date=df.loc[i]['Launch date']
        date=date.split(' ')
        if ',' in date[1]:
            # 'Month DD, YYYY ...' style
            date=date[1].replace(',','')+' '+date[0]+' '+date[2][:4]
        else:
            # 'DD Month YYYY ...' style
            date=date[0]+' '+date[1]+' '+date[2][:4]
        dg['Date']=date
        dgs.append(dg)
In [324]:
dgs=pd.concat(dgs).reset_index()
dhs=dgs.set_index(['Expedition','Crew'])   # (Expedition, Crew) lookup used by get_duration below
In [325]:
def get_duration(duration):
    # Resolve a 'Duration(days)' value, following 'Transferred ...' references
    # via the dhs lookup; `crew` is the module-level variable set in the loop below.
    default='160'                 # fallback when the duration cannot be determined
    duration=str(duration)
    if duration=='nan':
        return default
    if 'ransfer' in duration:     # e.g. 'Transferred to Expedition 39'
        duration=' '.join(duration.split(' ')[-2:])    # keep 'Expedition 39'
        try:
            duration=dhs.loc[duration].loc[crew]['Duration(days)']
        except Exception:
            print(crew,duration)
            return default
    return duration
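get_duration depends on the dhs lookup built above and on the crew variable set by the calling loop below; when a duration cell reads like 'Transferred to Expedition N', its last two words become the expedition key for that lookup. A toy sketch of the lookup pattern (hypothetical names and values, not real data):
In [ ]:
# Hypothetical example of the (Expedition, Crew) lookup that get_duration performs.
_toy=pd.DataFrame({'Expedition':['Expedition 1','Expedition 2'],
                   'Crew':['A. Example','A. Example'],
                   'Duration(days)':['140 days','Transferred to Expedition 1']})
_toy=_toy.set_index(['Expedition','Crew'])
_toy.loc['Expedition 1'].loc['A. Example']['Duration(days)']   # -> '140 days'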
In [326]:
data=[]
for i in dgs.index:
    crew=dgs.loc[i]['Crew']
    date=dgs.loc[i]['Date']
    if 'ransfer' not in date:                   # skip rows whose date is itself a transfer note
        duration=dgs.loc[i]['Duration(days)']
        # call repeatedly so chained 'Transferred ...' references get resolved
        duration=get_duration(duration)
        duration=get_duration(duration)
        duration=get_duration(duration)
        if '[' in duration:
            duration=duration[:duration.find('[')]    # strip footnote markers like '[1]'
        duration=int(np.round(float(duration.replace('days','').replace('day','').strip()),0))
        data.append({'Crew':crew,'Date':date,'Duration':duration})
In [327]:
data=pd.DataFrame(data)
In [328]:
data.head()
Out[328]:
In [329]:
import bs4
import requests
r=requests.get(url)
soup=bs4.BeautifulSoup(r.content,'html.parser')   # explicit parser avoids the bs4 default-parser warning
tables=soup.findAll("table")
In [330]:
links={}
country_map={}
# Walk the rows of the first two tables, collecting each crew member's article
# link and the flag image (country) that precedes their name.
for trs in [tables[0].findAll("tr"),tables[1].findAll("tr")]:
    for i,tr in enumerate(trs):
        if i>0:                                  # skip the header row
            aas=tr.findAll("a")
            for j,a in enumerate(aas):
                if a:
                    txt=a.text
                    if txt:
                        if '[' not in txt:       # ignore footnote anchors
                            links[txt]=a['href']
                            if txt in data['Crew'].values:
                                if txt not in country_map:
                                    # the preceding anchor wraps the crew member's flag image
                                    country=aas[j-1].find('img')['alt']
                                    country_map[txt]=country
                    else:
                        if j>1:
                            # text-less anchor: a flag image; record its URL per country
                            country=a.find('img')['alt']
                            links[country]=a.find('img')['src']
In [331]:
countries=pd.DataFrame(country_map,index=['Country']).T
links=pd.DataFrame(links,index=['Link']).T
In [332]:
data=data.join(countries,on='Crew')
data['Crew_link']=data.join(links,on='Crew')['Link']
data['Country_link']=data.join(links,on='Country')['Link']
In [333]:
data.to_csv('data.csv')
In [342]:
ndata={}
In [343]:
# Expand each stay into one entry per day, grouped by date and country.
for i in data.index:
    start=pd.to_datetime(data.loc[i]['Date'])
    periods=data.loc[i]['Duration']
    crew=data.loc[i]['Crew']
    country=data.loc[i]['Country']
    for idate in pd.date_range(start,periods=periods,freq='1D'):
        date=str(idate)[:10]                     # 'YYYY-MM-DD'
        if date not in ndata: ndata[date]={}
        if country not in ndata[date]: ndata[date][country]={'crew':[],'count':0}
        ndata[date][country]['crew'].append(crew)
        ndata[date][country]['count']+=1
In [347]:
pd.DataFrame(ndata)
Out[347]:
In [345]:
import json
In [346]:
with open('ndata.json','w') as f:
    f.write(json.dumps(ndata))
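For reference, ndata.json maps 'YYYY-MM-DD' dates to countries to a {'crew': [...], 'count': n} record; a quick sketch (assuming the file written above) of reading it back into a per-day, per-country head-count table:
In [ ]:
import json
with open('ndata.json') as f:
    _ndata=json.load(f)
# rows = dates, columns = countries, values = number of crew on board that day
counts=pd.DataFrame({d:{c:v['count'] for c,v in by_country.items()}
                     for d,by_country in _ndata.items()}).T.fillna(0).astype(int)
counts.head()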