In [44]:
import pandas as pd, json, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [45]:
url='http://en.wikipedia.org/wiki/List_of_airports_in_Hungary'
df=pd.read_html(url)
#take the first table on the page, use its top row as the column header, and index the airports by IATA code
df=df[0].loc[:6].T.set_index(0).T.loc[2:].set_index('IATA')
In [3]:
df
Out[3]:
In [4]:
from pygeocoder import Geocoder
apik='AIzaSyDybC2OroTE_XDJTuxjKruxFpby5VDhEGk'
In [5]:
locations={}
for i in df.index:
    results = Geocoder(apik).geocode(i+' airport Hungary')
    locations[i]=results[0].coordinates
    print i
In [6]:
file("locations_hu.json",'w').write(json.dumps(locations))
In [7]:
locations=json.loads(file('locations_hu.json','r').read())
In [8]:
import requests
In [9]:
airportialinks={}
for i in locations:
    print i,
    #manual fix: the QPJ entry is searched as PEV
    if i=='QPJ': url='https://cse.google.com/cse?cx=partner-pub-6479063288582225%3A8064105798&cof=FORID%3A10&ie=UTF-8&q='+'PEV'+'+airport+hungary'
    else: url='https://cse.google.com/cse?cx=partner-pub-6479063288582225%3A8064105798&cof=FORID%3A10&ie=UTF-8&q='+str(i)+'+airport+hungary'
    m=requests.get(url).content
    #the Airportia link sits in the 6th table of the search results page
    z=pd.read_html(m)[5][0][0]
    z=z[z.find('http'):]
    airportialinks[i]=z
    print z
In [10]:
#reformat
for z in airportialinks:
    airportialinks[z]=airportialinks[z].split('arrivals')[0].split('departures')[0].replace(' ','').replace('...','-international-')
    if airportialinks[z][-1]!='/':airportialinks[z]+='/'
    #manual fixes
    if z=='QGY':airportialinks[z]=u'https://www.airportia.com/hungary/győr_pér-international-airport/'
    print airportialinks[z]
In [65]:
sch={}
Record schedules for 2 weeks, then augment the counts with weekly flight numbers. Seasonal and seasonal charter flights count as once per week for 3 months, i.e. 12/52 flights per week. TGM is treated separately, since its flight history lies in the past.
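As a reference for the weighting above, a minimal sketch (the flight-type labels here are assumptions for illustration, not values taken from the scraped tables):

def weekly_weight(flight_type):
    #seasonal and seasonal charter services run for roughly 3 months (12 weeks) a year,
    #so they count as 12/52 of a weekly flight; everything else counts in full
    if flight_type in ('seasonal', 'seasonal charter'):
        return 12.0/52
    return 1.0

print weekly_weight('seasonal')   # ~0.23 flights per week
print weekly_weight('scheduled')  # 1.0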
In [66]:
for i in locations:
    print i
    if i not in sch:sch[i]={}
    #march 11-24 = 2 weeks
    for d in range(11,25):
        url=airportialinks[i]
        full=url+'arrivals/201703'+str(d)
        #sch[i] is keyed by the full URL, so skip pages that were already downloaded
        if full not in sch[i]:
            try:
                m=requests.get(full).content
                sch[i][full]=pd.read_html(m)[0]
                #print full
            except: pass #print 'no tables',i,d
In [67]:
for i in range(11,25):
    testurl=u'https://www.airportia.com/hungary/budapest-liszt-ferenc-international-airport/arrivals/201703'+str(i)
    print 'nr. of flights on March',i,':',len(sch['BUD'][testurl])
testurl=u'https://www.airportia.com/hungary/budapest-liszt-ferenc-international-airport/arrivals/20170318'
k=sch['BUD'][testurl]
k[k['From']=='Frankfurt FRA']
Out[67]:
sch checks out with the source.
In [68]:
mdf=pd.DataFrame()
In [69]:
for i in sch:
    for d in sch[i]:
        df=sch[i][d].drop(sch[i][d].columns[3:],axis=1).drop(sch[i][d].columns[0],axis=1)
        df['To']=i
        df['Date']=d
        mdf=pd.concat([mdf,df])
In [70]:
#count Frankfurt-Hahn (HHN) as Frankfurt
mdf=mdf.replace('Hahn','Frankfurt')
mdf=mdf.replace('Hahn HHN','Frankfurt HHN')
In [71]:
#split the 'From' field of the form 'City CODE' into the city name and the airport code
mdf['City']=[i[:i.rfind(' ')] for i in mdf['From']]
mdf['Airport']=[i[i.rfind(' ')+1:] for i in mdf['From']]
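A quick check of the split on one of the 'From' values seen above:

s='Frankfurt FRA'
print s[:s.rfind(' ')], '|', s[s.rfind(' ')+1:]   # Frankfurt | FRA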
In [72]:
k=mdf[mdf['Date']==testurl]
k[k['From']=='Frankfurt FRA']
Out[72]:
mdf checks out with the source.
In [73]:
file("mdf_hu_arrv.json",'w').write(json.dumps(mdf.reset_index().to_json()))
In [74]:
len(mdf)
Out[74]:
In [55]:
airlines=set(mdf['Airline'])
In [56]:
cities=set(mdf['City'])
In [57]:
file("cities_hu_arrv.json",'w').write(json.dumps(list(cities)))
file("airlines_hu_arrv.json",'w').write(json.dumps(list(airlines)))
In [60]:
citycoords={}
In [61]:
for i in cities:
    if i not in citycoords:
        #manual disambiguation of ambiguous city names
        if i==u'Birmingham': z='Birmingham, UK'
        elif i==u'Valencia': z='Valencia, Spain'
        elif i==u'Naples': z='Naples, Italy'
        elif i==u'St. Petersburg': z='St. Petersburg, Russia'
        elif i==u'Bristol': z='Bristol, UK'
        else: z=i
        citycoords[i]=Geocoder(apik).geocode(z)
        print i
In [62]:
citysave={}
for i in citycoords:
    citysave[i]={"coords":citycoords[i][0].coordinates,
                 "country":citycoords[i][0].country}
In [63]:
file("citysave_hu_arrv.json",'w').write(json.dumps(citysave))