In [1]:
import pandas as pd, json, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
url='http://en.wikipedia.org/wiki/List_of_airports_in_Romania'
df=pd.read_html(url)
#first table on the page: use the first row as column names, drop the leftover header row, index by IATA code
df=df[0].loc[:17].T.set_index(0).T.loc[2:].set_index('IATA')
In [3]:
df
Out[3]:
In [1]:
from pygeocoder import Geocoder
apik='AIzaSyDybC2OroTE_XDJTuxjKruxFpby5VDhEGk'
In [ ]:
results = Geocoder(apik).geocode(i+' airport romania') #test query for a single IATA code; the loop below runs this for every airport
In [5]:
locations={}
for i in df.index:
    results = Geocoder(apik).geocode(i+' airport romania')
    locations[i]=results[0].coordinates
    print i
In [6]:
file("locations_ro.json",'w').write(json.dumps(locations))
In [7]:
locations=json.loads(file('locations_ro.json','r').read())
In [8]:
import requests
In [9]:
airportialinks={}
#find each airport's Airportia page via a Google Custom Search query
for i in locations:
    print i,
    url='https://cse.google.com/cse?cx=partner-pub-6479063288582225%3A8064105798&cof=FORID%3A10&ie=UTF-8&q='+str(i)+'+airport+romania'
    m=requests.get(url).content
    z=pd.read_html(m)[5][0][0]
    z=z[z.find('http'):]
    airportialinks[i]=z
    print z
In [10]:
#reformat: strip the arrivals/departures suffix and normalize each link to the base airport URL
for z in airportialinks:
    airportialinks[z]=airportialinks[z].split('arrivals')[0].split('departures')[0].replace(' ','').replace('...','-international-')
    if airportialinks[z][-1]!='/':airportialinks[z]+='/'
    #manual fixes
    if z=='TSR':airportialinks[z]='https://www.airportia.com/romania/timişoara-traian-vuia-airport/'
    print airportialinks[z]
In [11]:
sch={}
Record schedules for 2 weeks, then convert the counts into weekly flight numbers. Seasonal and seasonal charter flights count as once per week for 3 months of the year, i.e. 12/52 flights per week. TGM is scraped separately, since its schedule history lies in the past.
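A minimal sketch of this weighting, assuming each scraped row carries a status label that marks seasonal services (the helper name and the labels are illustrative, not taken from the Airportia tables):

def weekly_weight(status):
    #seasonal and seasonal charter services: once per week for ~3 months a year
    if status in ('Seasonal', 'Seasonal Charter'):
        return 12.0/52
    #regular departures observed over a 2-week window count as half a flight per week
    return 0.5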
In [12]:
for i in locations:
    print i
    if i not in sch:sch[i]={}
    if i!='TGM':
        #march 11-24 = 2 weeks
        for d in range(11,25):
            full=airportialinks[i]+'departures/201703'+str(d)
            if full not in sch[i]: #sch[i] is keyed by the full departures URL
                try:
                    m=requests.get(full).content
                    sch[i][full]=pd.read_html(m)[0]
                    #print full
                except: pass #print 'no tables',i,d
    else:
        #november 17-30 = 2 weeks
        for d in range(17,31):
            full=airportialinks[i]+'departures/201611'+str(d)
            if full not in sch[i]: #sch[i] is keyed by the full departures URL
                try:
                    m=requests.get(full).content
                    sch[i][full]=pd.read_html(m)[0]
                    #print full
                except: pass #print 'no tables',i,d
In [13]:
mdf=pd.DataFrame()
In [14]:
for i in sch:
    for d in sch[i]:
        #keep only the destination and airline columns, then tag rows with the origin airport and scrape date
        df=sch[i][d].drop(sch[i][d].columns[3:],axis=1).drop(sch[i][d].columns[0],axis=1)
        df['From']=i
        df['Date']=d
        mdf=pd.concat([mdf,df])
In [15]:
mdf=mdf.replace('Hahn','Frankfurt')
mdf=mdf.replace('Hahn HHN','Frankfurt HHN')
In [16]:
mdf['City']=[i[:i.rfind(' ')] for i in mdf['To']]
mdf['Airport']=[i[i.rfind(' ')+1:] for i in mdf['To']]
In [17]:
file("mdf_ro_dest.json",'w').write(json.dumps(mdf.reset_index().to_json()))
In [25]:
len(mdf)
Out[25]:
In [18]:
airlines=set(mdf['Airline'])
In [19]:
cities=set(mdf['City'])
In [20]:
file("cities_ro_dest.json",'w').write(json.dumps(list(cities)))
file("airlines_ro_dest.json",'w').write(json.dumps(list(airlines)))
In [26]:
citycoords={}
In [27]:
for i in cities:
    if i not in citycoords:
        #disambiguate city names that would otherwise geocode to the wrong place
        if i==u'Birmingham': z='Birmingham, UK'
        elif i==u'Valencia': z='Valencia, Spain'
        elif i==u'Naples': z='Naples, Italy'
        elif i==u'St. Petersburg': z='St. Petersburg, Russia'
        elif i==u'Bristol': z='Bristol, UK'
        else: z=i
        citycoords[i]=Geocoder(apik).geocode(z)
        print i
In [28]:
citysave={}
for i in citycoords:
    citysave[i]={"coords":citycoords[i][0].coordinates,
                 "country":citycoords[i][0].country}
In [29]:
file("citysave_ro_dest.json",'w').write(json.dumps(citysave))