In [1]:
import pandas as pd, json, numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
%matplotlib inline

Load the airport lists of each country.


In [2]:
# Load the per-country airport data.
# L: airport page links per country, N: airport names/codes per country,
# M: country-level data (assumed from usage below -- TODO confirm against the JSON files).
# `open()` replaces the Python-2-only `file()` builtin, and the context
# manager guarantees the handles are closed.
with open('../json/L.json', 'r') as f:
    L = json.load(f)
with open('../json/M.json', 'r') as f:
    M = json.load(f)
with open('../json/N.json', 'r') as f:
    N = json.load(f)

In [3]:
import requests

In [4]:
# Build AP: country -> {airport name/code -> airportia link}.
# N[c][i] is paired positionally with L[c][i] (assumed -- TODO confirm
# against the JSON files); zip expresses that pairing directly instead
# of the original index loop, and the `if c not in AP` guard is dropped
# because AP starts empty and each country key of M is unique.
AP = {}
for c in M:
    AP[c] = dict(zip(N[c], L[c]))

Record schedules for 2 weeks, then augment the counts with weekly flight numbers. Seasonal and seasonal-charter flights count as once per week for 3 months, i.e. 12/52 per week. TGM is handled separately, since its history lies in the past.

Parse the departures pages.


In [5]:
# Root URL for all airport pages; per-airport links in AP are relative to it.
baseurl='https://www.airportia.com/'
import requests, urllib2  # NOTE(review): urllib2 is Python 2 only and is used solely by dead code in urlgetter

In [6]:
def urlgetter(url):
    """Fetch `url` and return the raw response body.

    The first GET primes the session's cookie jar (the site sets session
    cookies before serving real content); the second GET re-sends those
    cookies automatically and its body is returned.

    The original version also built an `urllib2` opener with hand-parsed
    cookie headers, but that opener was never used -- the function always
    returned the session's own GET -- so that dead code is removed.
    """
    s = requests.Session()
    # Warm-up request: populates s.cookies as a side effect.
    s.get(url)
    # Cookie-bearing request; .content is the raw body.
    return s.get(url).content

Good dates.


In [8]:
# SD: per-country schedules, filled by the scraping loop below.
SD = {}
# SC: airports with traffic per country (assumed from usage below -- TODO confirm).
# `open()` replaces the Python-2-only `file()` builtin; the context manager
# closes the handle.
with open('../json/SC2.json', 'r') as f:
    SC = json.load(f)

In [9]:
# Scrape departure tables for a slice of countries (indices 2..4 of AP.keys();
# widen the range to len(AP.keys()) for a full run).
# NOTE(review): Python 2 only -- AP.keys()[h] indexing and `print x,`
# statements do not work on Python 3, and dict key order is arbitrary here.
for h in range(2,5):#len(AP.keys())):
    c=AP.keys()[h]
    #country not parsed yet
    if c in SC:
        if c not in SD:
            # NOTE(review): initialised to a list but overwritten with the
            # dict `sch` at the end of this branch -- the list is never used.
            SD[c]=[]
            print h,c
            # airport name/code -> airportia relative link for this country
            airportialinks=AP[c]
            sch={}  # airport code -> {day-of-month -> departures DataFrame}
            #all airports of country, where there is traffic
            for i in airportialinks:
                if i in SC[c]:
                    print i,
                    if i not in sch:sch[i]={}
                    url=baseurl+airportialinks[i]
                    # warm-up fetch of the airport landing page (result unused;
                    # primes cookies inside urlgetter's session)
                    m=urlgetter(url)
                    for d in range (3,7):
                        #date not parsed yet
                        if d not in sch[i]:
                            # departures for 2017-04-0<d>, i.e. days 3..6
                            url=baseurl+airportialinks[i]+'departures/201704'+str(d)
                            m=urlgetter(url)
                            soup = BeautifulSoup(m, "lxml")
                            #if there are flights at all
                            if len(soup.findAll('table'))>0:
                                # first HTML table on the page is the schedule
                                sch[i][d]=pd.read_html(m)[0] 
                            else: print '--W-',d,
            SD[c]=sch
            print


2 Turkmenistan
CRZ ASB KRW TAZ MYP
3 Lithuania
VNO KUN PLQ
4 FYR of Macedonia
OHD --W- 5 SKP

Save


In [12]:
# Country-name -> ISO-code lookup from the shared countries workbook,
# indexed by country name for the .loc access in the save loop below.
cnc_path = '../../universal/countries/'
cnc_file = cnc_path + 'cnc.xlsx'
cnc = pd.read_excel(cnc_file).set_index('Name')

In [13]:
MDF=pd.DataFrame()  # accumulator: departures of all countries, concatenated below

In [14]:
# Flatten the scraped schedules into one DataFrame per country, enrich it,
# write it to the country's JSON folder, and append it to the global MDF.
for c in SD:
    sch=SD[c]
    # Collect the per-airport/per-day frames and concatenate once, instead of
    # growing a DataFrame with pd.concat inside the loop (quadratic copying).
    parts=[]
    for i in sch:
        for d in sch[i]:
            # keep only columns 1-2 (drop column 0 and everything from column 3 on)
            df=sch[i][d].drop(sch[i][d].columns[3:],axis=1).drop(sch[i][d].columns[0],axis=1)
            df['From']=i
            df['Date']=d
            parts.append(df)
    mdf=pd.concat(parts) if parts else pd.DataFrame()
    # Normalise Frankfurt-Hahn, which airportia lists as "Hahn"
    mdf=mdf.replace('Hahn','Frankfurt')
    mdf=mdf.replace('Hahn HHN','Frankfurt HHN')
    # "To" looks like "<City name> <IATA>"; split on the last space
    mdf['City']=[i[:i.rfind(' ')] for i in mdf['To']]
    mdf['Airport']=[i[i.rfind(' ')+1:] for i in mdf['To']]
    # NOTE(review): json.dumps(...to_json()) double-encodes -- the file holds a
    # JSON string that itself contains JSON. Kept as-is so downstream readers
    # that expect this format keep working.
    # `with open` replaces the Python-2-only `file()` and guarantees the write
    # is flushed and the handle closed.
    out_path='../countries/'+cnc.T.loc[c]['ISO2'].lower()+"/json/mdf_dest.json"
    with open(out_path,'w') as f:
        f.write(json.dumps(mdf.reset_index().to_json()))
    MDF=pd.concat([MDF,mdf])

In [18]:
MDF.reset_index().to_json('../json/MDF.json')