In [1]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Scrape the list of Jordanian airports from Wikipedia.
url = 'http://en.wikipedia.org/wiki/List_of_airports_in_Jordan'
first_table = pd.read_html(url)[0]
# Keep rows labelled 0..3 of the first table on the page.
top_rows = first_table.loc[:3]
# Transposing, indexing by column 0 and transposing back turns row 0 into the
# column labels.
relabelled = top_rows.T.set_index(0).T
# The remaining rows (labels 1 and up) are indexed by their IATA code.
df = relabelled.loc[1:].set_index('IATA')

In [3]:
# Display the parsed airport table (indexed by IATA code).
df


Out[3]:
Location ICAO Airport Name
IATA
AMM Amman OJAI Queen Alia International Airport
ADJ Amman OJAM Amman Civil Airport
AQJ Aqaba OJAQ King Hussein International Airport

In [4]:
from pygeocoder import Geocoder
# Read the Google geocoding API key from the environment instead of committing
# it to the notebook. Set GOOGLE_API_KEY before starting the kernel.
# NOTE(review): the key that used to be hardcoded on this line has been
# published with the notebook and should be revoked.
apik = os.environ.get('GOOGLE_API_KEY')
if not apik:
    print('GOOGLE_API_KEY is not set; geocoding requests will be sent without an API key')

In [5]:
# Geocode every airport by IATA code and keep the coordinates of the top hit.
locations = dict()
for iata in df.index:
    hits = Geocoder(apik).geocode(iata + ' airport Jordan')
    locations[iata] = hits[0].coordinates
    print(iata)


AMM
ADJ
AQJ

In [6]:
# Drop ADJ (Amman Civil Airport) from the airports to process.
# The default makes the cell safe to re-run: once ADJ has been removed, a plain
# pop('ADJ') would raise KeyError.
locations.pop('ADJ', None)


Out[6]:
(31.9748491, 35.9832471)

In [7]:
# Persist the airport coordinates as JSON. The context manager guarantees the
# file is flushed and closed; `open` replaces the Python-2-only `file` builtin.
with open("locations_jo.json", 'w') as f:
    f.write(json.dumps(locations))

In [8]:
# Reload the airport coordinates from disk. The context manager closes the file
# handle; `open` replaces the Python-2-only `file` builtin.
with open('locations_jo.json', 'r') as f:
    locations = json.loads(f.read())

In [9]:
import requests

In [10]:
# Find each airport's airportia.com page through a Google Custom Search query.
airportialinks = {}
for i in locations:
    url = 'https://cse.google.com/cse?cx=partner-pub-6479063288582225%3A8064105798&cof=FORID%3A10&ie=UTF-8&q=' + str(i) + '+airport+jordan'
    m = requests.get(url).content
    # NOTE(review): [5][0][0] presumably picks the first search hit out of the
    # sixth table on the result page; this depends on the page layout -- confirm.
    z = pd.read_html(m)[5][0][0]
    start = z.find('http')
    if start == -1:
        # str.find returns -1 when there is no URL in the cell; slicing with -1
        # would silently store the last character of the text as the "link".
        print('%s no link found in search result: %r' % (i, z))
        continue
    z = z[start:]
    airportialinks[i] = z
    print('%s %s' % (i, z))


AMM https://www.airportia.com/jordan/queen-alia-international-airport
AQJ https://www.airportia.com/jordan/aqaba-king...airport/arrivals

In [11]:
# Reformat: reduce every scraped link to the airport's base URL ending in '/'.
for code in airportialinks:
    # Cut off any trailing 'arrivals' / 'departures' page suffix.
    link = airportialinks[code].split('arrivals')[0].split('departures')[0]
    # Remove spaces and expand the '...' found in the scraped text.
    link = link.replace(' ', '').replace('...', '-international-')
    if link[-1] != '/':
        link += '/'
    # manual fixes
    if code == 'AQJ':
        link = u'https://www.airportia.com/jordan/aqaba-king-hussein-international-airport/'
    airportialinks[code] = link
    print(link)


https://www.airportia.com/jordan/queen-alia-international-airport/
https://www.airportia.com/jordan/aqaba-king-hussein-international-airport/

In [12]:
# Downloaded schedules: starts empty and is filled in by the download cell.
sch = dict()

Record schedules for 2 weeks, then augment the counts with weekly flight numbers. Seasonal and seasonal-charter flights count as once per week for 3 months, i.e. 12/52 per week. TGM is handled separately, since its history is in the past.


In [13]:
# Download the daily arrivals table of every airport and store it in
# sch[airport][arrivals URL].
for i in locations:
    print(i)
    if i not in sch:
        sch[i] = {}
    #march 11-24 = 2 weeks
    for d in range(11, 25):
        try:
            url = airportialinks[i]
            full = url + 'arrivals/201703' + str(d)
            # sch[i] is keyed by the full URL, so the cache test must use the
            # URL as well; testing the integer day never matches and forces a
            # fresh download of every page on each re-run.
            if full in sch[i]:
                continue
            m = requests.get(full).content
            sch[i][full] = pd.read_html(m)[0]
        except Exception as e:
            # Best effort: report the failure and continue with the next day.
            # Catching Exception (not a bare except) keeps Ctrl-C working.
            print('no tables %s %s (%s)' % (i, d, e))


AMM
AQJ

In [14]:
# Sanity check: number of arrivals per day at AMM, then the Frankfurt arrivals
# on 18 March for comparison with the source website.
for day in range(11, 25):
    day_url = u'https://www.airportia.com/jordan/queen-alia-international-airport/arrivals/201703' + str(day)
    print('nr. of flights on March %s : %s' % (day, len(sch['AMM'][day_url])))
testurl = u'https://www.airportia.com/jordan/queen-alia-international-airport/arrivals/20170318'
k = sch['AMM'][testurl]
k[k['From'] == 'Frankfurt FRA']


nr. of flights on March 11 : 99
nr. of flights on March 12 : 94
nr. of flights on March 13 : 86
nr. of flights on March 14 : 87
nr. of flights on March 15 : 92
nr. of flights on March 16 : 93
nr. of flights on March 17 : 95
nr. of flights on March 18 : 96
nr. of flights on March 19 : 94
nr. of flights on March 20 : 89
nr. of flights on March 21 : 84
nr. of flights on March 22 : 95
nr. of flights on March 23 : 91
nr. of flights on March 24 : 99
Out[14]:
Flight From Airline Scheduled Arrival Status Unnamed: 6
63 LH692 Frankfurt FRA Lufthansa 18:55 19:02 Landed Track >
79 RJ126 Frankfurt FRA Royal Jordanian 21:15 21:36 Delayed Track >

`sch` checks out against the source website.


In [15]:
# Empty frame that will receive the merged arrivals of all airports and days.
mdf = pd.DataFrame(data=None)

In [16]:
# Merge every downloaded schedule into one long table.
# The per-day frames are collected in a list and concatenated once; calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
# A dedicated name is used for the per-day frame so that the airport table
# `df` is not overwritten.
frames = []
for i in sch:
    for d in sch[i]:
        raw = sch[i][d]
        # Keep only the second and third columns (drop the first one and
        # everything from the fourth onwards).
        arrivals = raw.drop(raw.columns[3:], axis=1).drop(raw.columns[0], axis=1)
        arrivals['To'] = i
        # NOTE: 'Date' holds the schedule's URL (the key of sch[i]), not a
        # parsed date.
        arrivals['Date'] = d
        frames.append(arrivals)
mdf = pd.concat([mdf] + frames)

In [17]:
def _split_at_last_space(label):
    """Return (text before the last space, text after the last space)."""
    cut = label.rfind(' ')
    return label[:cut], label[cut + 1:]

# Split every 'From' value once and fan the two halves out into new columns.
_from_parts = [_split_at_last_space(label) for label in mdf['From']]
mdf['City'] = [head for head, _tail in _from_parts]
mdf['Airport'] = [tail for _head, tail in _from_parts]

In [18]:
# Repeat the Frankfurt spot check on the merged table.
k = mdf.loc[mdf['Date'] == testurl]
k.loc[k['From'] == 'Frankfurt FRA']


Out[18]:
From Airline To Date City Airport
63 Frankfurt FRA Lufthansa AMM https://www.airportia.com/jordan/queen-alia-in... Frankfurt FRA
79 Frankfurt FRA Royal Jordanian AMM https://www.airportia.com/jordan/queen-alia-in... Frankfurt FRA

`mdf` checks out against the source website.


In [19]:
# Persist the merged arrivals table. The context manager guarantees the file is
# flushed and closed; `open` replaces the Python-2-only `file` builtin.
# NOTE(review): to_json() already returns a JSON string, so json.dumps encodes
# it a second time. The format is kept as is because readers of this file are
# not visible here -- confirm before simplifying.
with open("mdf_jo_arrv.json", 'w') as f:
    f.write(json.dumps(mdf.reset_index().to_json()))

In [20]:
# Number of rows in the merged arrivals table.
mdf.shape[0]


Out[20]:
1336

In [21]:
# Distinct airline names in the merged table.
airlines = {airline for airline in mdf['Airline']}

In [22]:
# Distinct origin city names in the merged table.
cities = {city for city in mdf['City']}

In [23]:
# Persist the distinct cities and airlines as JSON lists. The context managers
# guarantee the files are flushed and closed; `open` replaces the
# Python-2-only `file` builtin.
with open("cities_jo_arrv.json", 'w') as f:
    f.write(json.dumps(list(cities)))
with open("airlines_jo_arrv.json", 'w') as f:
    f.write(json.dumps(list(airlines)))

In [24]:
# Geocoding results per city: starts empty and is filled in by the next cell.
citycoords = dict()

In [25]:
# City names that are geocoded with an explicit country qualifier (or, for
# Beida, under a different spelling); every other city is queried under its
# own name.
geocode_as = {
    u'Birmingham': 'Birmingham, UK',
    u'Valencia': 'Valencia, Spain',
    u'Naples': 'Naples, Italy',
    u'St. Petersburg': 'St. Petersburg, Russia',
    u'Bristol': 'Bristol, UK',
    u'Beida': 'Bayda, Libya',
}
for city in cities:
    if city in citycoords:
        # Already geocoded in an earlier run of this cell.
        continue
    query = geocode_as.get(city, city)
    citycoords[city] = Geocoder(apik).geocode(query)
    print(city)


Basra
Kiev
Istanbul
Arbil
Paris
Athens
Tripoli
Khartoum
Beida
Munich
Amman
Beirut
Muscat
Vienna
Bahrain
Bangkok
Dammam
Geneva
Larnaca
Abu Dhabi
Moscow
Chicago
Berlin
Medina
Ankara
Aqaba
Baghdad
Sulaimaniyah
Brussels
Jeddah
Sohag
Assiut
Dubai
Kuwait City
Helsinki
Barcelona
Rome
Tunis
Stockholm
Sharjah
Tel Aviv
Frankfurt
Montreal
Bucharest
Najaf
Doha
Riyadh
Cairo
Zurich
Algiers
Alexandria
Maastricht
New York
London
Madrid
Sharm el-Sheikh
Amsterdam
Copenhagen
Benghazi

In [26]:
# Reduce each geocoding result to the coordinates and country of its top hit.
citysave = {
    city: {"coords": hits[0].coordinates,
           "country": hits[0].country}
    for city, hits in citycoords.items()
}

In [27]:
# Persist the per-city coordinates and countries as JSON. The context manager
# guarantees the file is flushed and closed; `open` replaces the
# Python-2-only `file` builtin.
with open("citysave_jo_arrv.json", 'w') as f:
    f.write(json.dumps(citysave))