In [26]:
import pandas as pd, json, numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
%matplotlib inline
Load airports of each country
In [27]:
# Load the pre-scraped country lookup tables.
# Inferred from usage below: M lists country keys; L[c] and N[c] are
# parallel lists (airport URL paths and airport labels) — TODO confirm.
# Use open() in a context manager instead of the deprecated file()
# builtin so the handles are closed deterministically.
with open('../json/L.json', 'r') as f:
    L = json.loads(f.read())
with open('../json/M.json', 'r') as f:
    M = json.loads(f.read())
with open('../json/N.json', 'r') as f:
    N = json.loads(f.read())
In [28]:
import requests
In [29]:
# Build AP: country -> {airport label: airport URL path}.
# N[c] and L[c] are parallel lists, so zip them into a dict directly
# instead of indexing both by position. (The original's `if c not in AP`
# guard is redundant: AP starts empty and duplicate keys in M would
# rebuild the same mapping anyway.)
AP = {}
for c in M:
    AP[c] = dict(zip(N[c], L[c]))
Record schedules for 2 weeks, then augment the counts with weekly flight numbers. Seasonal and seasonal-charter flights count as once per week for 3 months, i.e. 12/52 flights per week. TGM is handled separately, since its history lies in the past.
Parse arrivals (note: an earlier version of this heading said "Departures", but the URLs below fetch `arrivals/` pages and the outputs are named `_arrv`).
In [30]:
# Root of the site being scraped; airport-specific paths from AP are
# appended to this below.
baseurl='https://www.airportia.com/'
# NOTE(review): `requests` is already imported in an earlier cell; only
# `urllib2` is new here (and it is used solely inside urlgetter).
import requests, urllib2
In [31]:
def urlgetter(url):
    """Fetch `url` and return the raw response body.

    Performs two GETs on a fresh session: the first lets the site set
    its cookies (requests.Session stores them automatically), the
    second — now carrying those cookies — returns the actual content.

    The original version also parsed the cookie jar by hand and built a
    urllib2 opener with Cookie headers, but that opener was never used
    to fetch anything; that dead code has been removed.
    """
    s = requests.Session()
    s.get(url)  # prime the session cookies
    return s.get(url).content
Good dates: the scrape below covers 2017-04-03 through 2017-04-16 (two weeks).
In [32]:
# SD accumulates the scraped schedules: country -> {airport: {day: table}}.
SD = {}
# SC2.json: per-country collection of airports that actually have traffic
# (presumably — inferred from the `if i in SC[c]` filter below; confirm).
# open() + context manager replaces the deprecated file() builtin and
# closes the handle deterministically.
with open('../json/SC2.json', 'r') as f:
    SC = json.loads(f.read())
In [ ]:
# Scrape two weeks of arrivals tables (2017-04-03 .. 2017-04-16) for every
# airport of every country in AP that appears in SC.
#pop out last - if applicable
# NOTE(review): `c` here leaks from a previous (interrupted) execution of
# the loop below — the intent appears to be to discard the last, possibly
# incomplete, country so it gets re-scraped. On a fresh kernel `c` is
# undefined and the bare except silently swallows the NameError; the bare
# except also hides any real error — confirm this is intentional.
try: SD.pop(c)
except: pass
for h in range(len(AP.keys())):
    c=AP.keys()[h]  # Py2: dict.keys() returns a list, so it is indexable
    #country not parsed yet
    if c in SC:
        if c not in SD:
            SD[c]=[]  # placeholder; overwritten with the dict `sch` at the end
        print h,c
        airportialinks=AP[c]  # airport label -> URL path for this country
        sch={}  # airport label -> {day-of-month: schedule DataFrame}
        #all airports of country, where there is traffic
        for i in airportialinks:
            if i in SC[c]:
                print i,
                if i not in sch:sch[i]={}
                url=baseurl+airportialinks[i]
                # NOTE(review): this fetch's result is unused — it seems to
                # exist only to warm up cookies/session state; confirm.
                m=urlgetter(url)
                for d in range (3,17):
                    #date not parsed yet
                    if d not in sch[i]:
                        # arrivals page for 2017-04-<d>
                        url=baseurl+airportialinks[i]+'arrivals/201704'+str(d)
                        m=urlgetter(url)
                        soup = BeautifulSoup(m, "lxml")
                        #if there are flights at all
                        if len(soup.findAll('table'))>0:
                            # first HTML table on the page is the schedule
                            sch[i][d]=pd.read_html(m)[0]
                        else: print '--W-',d,  # warn: no flights parsed for this day
        SD[c]=sch
        print
Save
In [14]:
dbpath='E:/Dropbox/Public/datarepo/aviation/' #large file db path
# NOTE(review): repr() of a dict containing DataFrames is neither valid
# JSON nor reliably round-trippable despite the .json extension; kept
# as-is to preserve behavior, but consider pickle or per-frame to_json.
# open() + context manager replaces the deprecated file() builtin so the
# handle is flushed and closed even if the write fails.
with open(dbpath+"json/SD_arrv.json",'w') as f:
    f.write(repr(SD))
In [18]:
# Country metadata table indexed by country name; the 'ISO2' column is
# used below to build per-country output directories.
cnc_path='../../universal/countries/'
cnc=pd.read_excel(cnc_path+'cnc.xlsx').set_index('Name')
In [23]:
MDF=pd.DataFrame()
In [24]:
for c in SD:
sch=SD[c]
mdf=pd.DataFrame()
for i in sch:
for d in sch[i]:
df=sch[i][d].drop(sch[i][d].columns[3:],axis=1).drop(sch[i][d].columns[0],axis=1)
df['To']=i
df['Date']=d
mdf=pd.concat([mdf,df])
mdf=mdf.replace('Hahn','Frankfurt')
mdf=mdf.replace('Hahn HHN','Frankfurt HHN')
if len(sch)>0:
mdf['City']=[i[:i.rfind(' ')] for i in mdf['From']]
mdf['Airport']=[i[i.rfind(' ')+1:] for i in mdf['From']]
cpath=str(cnc.T.loc[c]['ISO2']).lower()
if cpath=='nan':cpath='na'
file('../countries/'+cpath+"/json/mdf_arrv.json",'w').write(json.dumps(mdf.reset_index().to_json()))
MDF=pd.concat([MDF,mdf])
print c,
In [25]:
dbpath='E:/Dropbox/Public/datarepo/aviation/' #large file db path
# Persist the combined arrivals table for all countries to the large-file
# repository (to_json opens and closes the file itself).
MDF.reset_index().to_json(dbpath+'json/MDF_arrv.json')