In [26]:
import pandas as pd, json, numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
%matplotlib inline
Load airports of each country
In [27]:
# Load the pre-scraped country lookup tables.
# Inferred from usage below: M lists country keys; L[c] and N[c] are
# parallel lists (airport URL paths and airport labels) — TODO confirm.
# Use open() in a context manager instead of the deprecated file()
# builtin so the handles are closed deterministically.
with open('../json/L.json', 'r') as f:
    L = json.loads(f.read())
with open('../json/M.json', 'r') as f:
    M = json.loads(f.read())
with open('../json/N.json', 'r') as f:
    N = json.loads(f.read())
In [28]:
import requests
In [29]:
# Build AP: country -> {airport label: airport URL path}.
# N[c] and L[c] are parallel lists, so zip them into a dict directly
# instead of indexing both by position. (The original's `if c not in AP`
# guard is redundant: AP starts empty and duplicate keys in M would
# rebuild the same mapping anyway.)
AP = {}
for c in M:
    AP[c] = dict(zip(N[c], L[c]))
Record schedules for 2 weeks, then augment the counts with weekly flight numbers. Seasonal and seasonal-charter flights count as once per week for 3 months, i.e. 12/52 flights per week. TGM is handled separately, since its history lies in the past.
Parse arrivals (note: an earlier version of this heading said "Departures", but the URLs below fetch `arrivals/` pages and the outputs are named `_arrv`).
In [30]:
# Root of the site being scraped; airport-specific paths from AP are
# appended to this below.
baseurl='https://www.airportia.com/'
# NOTE(review): `requests` is already imported in an earlier cell; only
# `urllib2` is new here (and it is used solely inside urlgetter).
import requests, urllib2
In [31]:
def urlgetter(url):
    """Fetch `url` and return the raw response body.

    Performs two GETs on a fresh session: the first lets the site set
    its cookies (requests.Session stores them automatically), the
    second — now carrying those cookies — returns the actual content.

    The original version also parsed the cookie jar by hand and built a
    urllib2 opener with Cookie headers, but that opener was never used
    to fetch anything; that dead code has been removed.
    """
    s = requests.Session()
    s.get(url)  # prime the session cookies
    return s.get(url).content
Good dates: the scrape below covers 2017-04-03 through 2017-04-16 (two weeks).
In [32]:
# SD accumulates the scraped schedules: country -> {airport: {day: table}}.
SD = {}
# SC2.json: per-country collection of airports that actually have traffic
# (presumably — inferred from the `if i in SC[c]` filter below; confirm).
# open() + context manager replaces the deprecated file() builtin and
# closes the handle deterministically.
with open('../json/SC2.json', 'r') as f:
    SC = json.loads(f.read())
In [ ]:
# Scrape two weeks of arrivals tables (2017-04-03 .. 2017-04-16) for every
# airport of every country in AP that appears in SC.
#pop out last - if applicable
# NOTE(review): `c` here leaks from a previous (interrupted) execution of
# the loop below — the intent appears to be to discard the last, possibly
# incomplete, country so it gets re-scraped. On a fresh kernel `c` is
# undefined and the bare except silently swallows the NameError; the bare
# except also hides any real error — confirm this is intentional.
try: SD.pop(c)
except: pass
for h in range(len(AP.keys())):
    c=AP.keys()[h]  # Py2: dict.keys() returns a list, so it is indexable
    #country not parsed yet
    if c in SC:
        if c not in SD:
            SD[c]=[]  # placeholder; overwritten with the dict `sch` at the end
        print h,c
        airportialinks=AP[c]  # airport label -> URL path for this country
        sch={}  # airport label -> {day-of-month: schedule DataFrame}
        #all airports of country, where there is traffic
        for i in airportialinks:
            if i in SC[c]:
                print i,
                if i not in sch:sch[i]={}
                url=baseurl+airportialinks[i]
                # NOTE(review): this fetch's result is unused — it seems to
                # exist only to warm up cookies/session state; confirm.
                m=urlgetter(url)
                for d in range (3,17):
                    #date not parsed yet
                    if d not in sch[i]:
                        # arrivals page for 2017-04-<d>
                        url=baseurl+airportialinks[i]+'arrivals/201704'+str(d)
                        m=urlgetter(url)
                        soup = BeautifulSoup(m, "lxml")
                        #if there are flights at all
                        if len(soup.findAll('table'))>0:
                            # first HTML table on the page is the schedule
                            sch[i][d]=pd.read_html(m)[0]
                        else: print '--W-',d,  # warn: no flights parsed for this day
        SD[c]=sch
        print
Save
In [14]:
dbpath='E:/Dropbox/Public/datarepo/aviation/' #large file db path
# NOTE(review): repr() of a dict containing DataFrames is neither valid
# JSON nor reliably round-trippable despite the .json extension; kept
# as-is to preserve behavior, but consider pickle or per-frame to_json.
# open() + context manager replaces the deprecated file() builtin so the
# handle is flushed and closed even if the write fails.
with open(dbpath+"json/SD_arrv.json",'w') as f:
    f.write(repr(SD))
In [18]:
# Country metadata table indexed by country name; the 'ISO2' column is
# used below to build per-country output directories.
cnc_path='../../universal/countries/'
cnc=pd.read_excel(cnc_path+'cnc.xlsx').set_index('Name')
In [23]:
MDF=pd.DataFrame()
In [24]:
for c in SD:
sch=SD[c]
mdf=pd.DataFrame()
for i in sch:
for d in sch[i]:
df=sch[i][d].drop(sch[i][d].columns[3:],axis=1).drop(sch[i][d].columns[0],axis=1)
df['To']=i
df['Date']=d
mdf=pd.concat([mdf,df])
mdf=mdf.replace('Hahn','Frankfurt')
mdf=mdf.replace('Hahn HHN','Frankfurt HHN')
if len(sch)>0:
mdf['City']=[i[:i.rfind(' ')] for i in mdf['From']]
mdf['Airport']=[i[i.rfind(' ')+1:] for i in mdf['From']]
cpath=str(cnc.T.loc[c]['ISO2']).lower()
if cpath=='nan':cpath='na'
file('../countries/'+cpath+"/json/mdf_arrv.json",'w').write(json.dumps(mdf.reset_index().to_json()))
MDF=pd.concat([MDF,mdf])
print c,
In [25]:
dbpath='E:/Dropbox/Public/datarepo/aviation/' #large file db path
# Persist the combined arrivals table for all countries to the large-file
# repository (to_json opens and closes the file itself).
MDF.reset_index().to_json(dbpath+'json/MDF_arrv.json')