In [1]:
import pandas as pd, json, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Scrape Wikipedia's list of airports in Jordan and index it by IATA code.
url='http://en.wikipedia.org/wiki/List_of_airports_in_Jordan'
wiki_tables=pd.read_html(url)
# First table on the page, restricted to row labels 0..3 (.loc slices are inclusive).
airport_rows=wiki_tables[0].loc[:3]
# Transpose, index on column 0, transpose back: row 0 becomes the column header.
airport_table=airport_rows.T.set_index(0).T
# Drop the former header row (label 0) and key the rest by IATA code.
df=airport_table.loc[1:].set_index('IATA')

In [3]:
# Display the airport table (indexed by IATA code) as the cell output.
df


Out[3]:
Location ICAO Airport Name
IATA
AMM Amman OJAI Queen Alia International Airport
ADJ Amman OJAM Amman Civil Airport
AQJ Aqaba OJAQ King Hussein International Airport

In [4]:
from pygeocoder import Geocoder
# Google geocoding API key, read from the environment so the secret is not
# stored in the notebook. The key that used to be hard-coded here has been
# published with the notebook and should be revoked/rotated.
import os
import warnings
apik=os.environ.get('GOOGLE_GEOCODING_API_KEY')
if not apik:
    # NOTE(review): pygeocoder is assumed to accept a missing key and fall
    # back to unauthenticated requests - confirm against its documentation.
    warnings.warn('GOOGLE_GEOCODING_API_KEY is not set; geocoding requests will be sent without an API key')

In [5]:
locations={}
for i in df.index:
    results = Geocoder(apik).geocode(i+' airport Jordan')
    locations[i]=results[0].coordinates
    print i


AMM
ADJ
AQJ

In [7]:
# Remove ADJ (Amman Civil Airport in the table above) from the airports to
# process. The default makes the cell safe to re-run: once the key is gone,
# a bare pop('ADJ') would raise KeyError.
locations.pop('ADJ',None)


Out[7]:
(31.9748491, 35.9832471)

In [8]:
# Cache the airport coordinates on disk. The context manager closes (and
# therefore flushes) the file; the old file(...).write(...) left that to
# garbage collection.
with open("locations_jo.json",'w') as out_file:
    out_file.write(json.dumps(locations))

In [9]:
# Reload the cached airport coordinates; the file handle is closed as soon
# as the content has been parsed.
with open('locations_jo.json','r') as in_file:
    locations=json.loads(in_file.read())

In [10]:
import requests

In [11]:
airportialinks={}
for i in locations:
    print i,
    url='https://cse.google.com/cse?cx=partner-pub-6479063288582225%3A8064105798&cof=FORID%3A10&ie=UTF-8&q='+str(i)+'+airport+jordan'
    m=requests.get(url).content
    z=pd.read_html(m)[5][0][0]
    z=z[z.find('http'):]
    airportialinks[i]=z
    print z


AMM https://www.airportia.com/jordan/queen-alia-international-airport
AQJ https://www.airportia.com/jordan/aqaba-king...airport/arrivals

In [12]:
#reformat
for z in airportialinks:
    airportialinks[z]=airportialinks[z].split('arrivals')[0].split('departures')[0].replace(' ','').replace('...','-international-')
    if airportialinks[z][-1]!='/':airportialinks[z]+='/' 
    #manual fixes
    if z=='AQJ':airportialinks[z]=u'https://www.airportia.com/jordan/aqaba-king-hussein-international-airport/'
    print airportialinks[z]


https://www.airportia.com/jordan/queen-alia-international-airport/
https://www.airportia.com/jordan/aqaba-king-hussein-international-airport/

In [13]:
# Schedule store, filled by the download cell below:
# airport IATA code -> {departures-page URL -> table parsed from that page}.
sch={}

Record schedules for 2 weeks, then augment the count with weekly flight numbers. Seasonal and seasonal charter flights will count as once per week for 3 months, i.e. 12/52 per week. TGM is handled separately, since its history is in the past.


In [14]:
for i in locations:
    print i
    if i not in sch:sch[i]={}
    #march 11-24 = 2 weeks
    for d in range (11,25):
        if d not in sch[i]:
            try:
                url=airportialinks[i]
                full=url+'departures/201703'+str(d)
                m=requests.get(full).content
                sch[i][full]=pd.read_html(m)[0]
                #print full
            except: pass #print 'no tables',i,d


AMM
AQJ

In [15]:
for i in range(11,25):
    testurl=u'https://www.airportia.com/jordan/queen-alia-international-airport/departures/201703'+str(i)
    print 'nr. of flights on March',i,':',len(sch['AMM'][testurl])
testurl=u'https://www.airportia.com/jordan/queen-alia-international-airport/departures/20170318'
k=sch['AMM'][testurl]
k[k['To']=='Frankfurt FRA']


nr. of flights on March 11 : 98
nr. of flights on March 12 : 95
nr. of flights on March 13 : 83
nr. of flights on March 14 : 91
nr. of flights on March 15 : 92
nr. of flights on March 16 : 94
nr. of flights on March 17 : 93
nr. of flights on March 18 : 97
nr. of flights on March 19 : 97
nr. of flights on March 20 : 90
nr. of flights on March 21 : 85
nr. of flights on March 22 : 92
nr. of flights on March 23 : 93
nr. of flights on March 24 : 97
Out[15]:
Flight To Airline Scheduled Departure Status Unnamed: 6
7 LH693 Frankfurt FRA Lufthansa 02:20 02:27 Landed Track >
43 RJ125 Frankfurt FRA Royal Jordanian 11:20 11:33 Landed Track >

sch checks out with source


In [16]:
# Empty master frame; the next cell appends every downloaded schedule to it.
mdf=pd.DataFrame()

In [17]:
# Stack all downloaded schedules into the master frame `mdf`, tagging each
# row with its origin airport ('From') and the page it came from ('Date').
# Frames are collected in a list and concatenated once; calling pd.concat
# inside the loop recopies the accumulated frame on every pass.
# NOTE: `df` is rebound here, replacing the airport table built earlier.
frames=[] if mdf.shape==(0,0) else [mdf]
for i in sch:
    for d in sch[i]:
        # Keep only the columns at positions 1 and 2 ('To' and 'Airline' in
        # the schedule tables shown above).
        df=sch[i][d].iloc[:,1:3].copy()
        df['From']=i
        df['Date']=d
        frames.append(df)
if frames:
    # Original row labels are kept (no ignore_index), as before.
    mdf=pd.concat(frames)

In [18]:
# Split 'To' (e.g. 'Frankfurt FRA') at its last space into the city name and
# the trailing airport code.
# rpartition returns ('', '', text) when there is no space, so such a value
# gives an empty City and the whole text as Airport. The previous rfind-based
# slicing gave the same Airport but silently cut the last character off City.
dest_parts=[dest.rpartition(' ') for dest in mdf['To']]
mdf['City']=[parts[0] for parts in dest_parts]
mdf['Airport']=[parts[2] for parts in dest_parts]

In [19]:
# Same spot check as for sch, now on the merged frame: the rows that came
# from the testurl page and fly to Frankfurt.
from_test_page=mdf['Date']==testurl
k=mdf[from_test_page]
to_frankfurt=k['To']=='Frankfurt FRA'
k[to_frankfurt]


Out[19]:
To Airline From Date City Airport
7 Frankfurt FRA Lufthansa AMM https://www.airportia.com/jordan/queen-alia-in... Frankfurt FRA
43 Frankfurt FRA Royal Jordanian AMM https://www.airportia.com/jordan/queen-alia-in... Frankfurt FRA

mdf checks out with source


In [20]:
# Persist the merged schedule. The context manager closes the file, which
# the old file(...).write(...) call never did explicitly.
# NOTE: to_json() already returns a JSON string, so json.dumps() wraps it in
# a second layer of JSON encoding. This is kept unchanged so that the file
# format stays the same for whatever reads it.
with open("mdf_jo_dest.json",'w') as out_file:
    out_file.write(json.dumps(mdf.reset_index().to_json()))

In [21]:
# Total number of departure rows collected in mdf.
len(mdf)


Out[21]:
1341

In [22]:
# Distinct values of the 'Airline' column.
airlines=set(mdf['Airline'])

In [23]:
# Distinct values of the 'City' column (destination cities).
cities=set(mdf['City'])

In [24]:
# Save the destination cities and the airlines as JSON lists. Each file is
# written inside a context manager so its handle is closed deterministically.
# Element order follows set iteration order, as before.
with open("cities_jo_dest.json",'w') as out_file:
    out_file.write(json.dumps(list(cities)))
with open("airlines_jo_dest.json",'w') as out_file:
    out_file.write(json.dumps(list(airlines)))

In [25]:
# Geocoding results per destination city; filled by the next cell, which
# skips cities that already have an entry.
citycoords={}

In [26]:
for i in cities:
    if i not in citycoords:
        if i==u'Birmingham': z='Birmingham, UK'
        elif i==u'Valencia': z='Valencia, Spain'
        elif i==u'Naples': z='Naples, Italy'
        elif i==u'St. Petersburg': z='St. Petersburg, Russia'
        elif i==u'Bristol': z='Bristol, UK'
        elif i==u'Beida': z='Bayda, Libya'
        else: z=i
        citycoords[i]=Geocoder(apik).geocode(z)
        print i


Basra
Kiev
Istanbul
Arbil
Paris
Athens
Tripoli
Assiut
Beida
Munich
Amman
Beirut
Muscat
Vienna
Bahrain
Ankara
Dammam
Geneva
Larnaca
Abu Dhabi
Moscow
Chicago
Khartoum
Medina
Bangkok
Aqaba
Baghdad
Sulaimaniyah
Brussels
Hong Kong
Jeddah
Sohag
Benghazi
Dubai
Kuwait City
Berlin
Barcelona
Rome
Tunis
Stockholm
Sharjah
Tel Aviv
Frankfurt
Montreal
Bucharest
Alexandria
Doha
Riyadh
Cairo
Zurich
Algiers
Helsinki
Maastricht
New York
London
Madrid
Sharm el-Sheikh
Amsterdam
Copenhagen
Najaf

In [27]:
# Reduce each geocoding result to what gets saved: the coordinates and the
# country of its first hit.
citysave={
    city:{"coords":hits[0].coordinates,
          "country":hits[0].country}
    for city,hits in citycoords.items()
}

In [28]:
# Save the per-city coordinates and country as JSON; the context manager
# closes the file handle that file(...).write(...) used to leave open.
with open("citysave_jo_dest.json",'w') as out_file:
    out_file.write(json.dumps(citysave))