In [2]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, requests, urllib2
from bs4 import BeautifulSoup
%matplotlib inline
In [82]:
url = 'https://www.airportia.com/airports/'
# A requests.Session persists cookies between calls on its own, so the
# urllib2 opener the original built here (parsing str(s.cookies) and
# appending Cookie headers) was dead code -- the opener was never used
# to fetch anything. The first GET collects the site's session cookies,
# the second fetches the page with those cookies attached.
s = requests.Session()
s.get(url)  # warm-up request: populates s.cookies
content = s.get(url).content
soup = BeautifulSoup(content, "lxml")
In [83]:
# Build A: {country name: relative URL path} from the second "textlist"
# block of the index page. The extraction slices the tag's repr rather
# than reading attributes directly; fragile, but kept semantically
# identical (it yields the same byte strings the rest of the notebook
# keys on). A stays [] when the page has no textlist blocks.
A = []
textlists = soup.findAll(attrs={'class': 'textlist'})
if len(textlists) > 0:
    links = textlists[1].findAll('a')
    A = {}
    for tag in links:
        raw = str(tag)
        name = raw[raw.find('title') + 7:raw.find('>') - 1]
        path = raw[raw.find('href') + 6:raw.find('title') - 2]
        A[name] = path
In [84]:
A
Out[84]:
Remove duplicates
In [85]:
# Invert A into D: URL path -> list of country names pointing at it.
D = {}
for name in A:
    if A[name] not in D:
        D[A[name]] = []
    D[A[name]].append(name)
# A URL claimed by several names is a duplicate entry: keep only the
# last name per URL. (The original popped only D[i][0], which left
# extra duplicates behind whenever 3+ names shared one URL; for the
# common 2-name case the behavior is unchanged.)
for path in D:
    if len(D[path]) > 1:
        for name in D[path][:-1]:
            A.pop(name)
Capture airport lists
In [86]:
# Per-country capture state, filled in by the scraping loop below:
L={}  # country name -> list of airport page URL paths
M={}  # country name -> list of airport link titles
N={}  # country name -> list of 3-char slices of each link's repr
      # (presumably airport codes -- TODO confirm against the markup)
F=[]  # country names whose page had no 'textlist' block (failures)
baseurl='https://www.airportia.com'
In [87]:
# Scrape each country's airport list into L/M/N. Re-runnable: a country
# is fetched only when it has no result yet or previously failed.
for c in A:
    # Original condition was `c not in L or F`, which -- once F is
    # non-empty -- reprocesses every country, successes included; test
    # membership of c itself instead.
    if c not in L or c in F:
        url = baseurl + A[c]
        # The session carries the site cookies by itself; the urllib2
        # opener the original rebuilt per country was never used
        # (dead code), so it is dropped here.
        s = requests.Session()
        s.get(url)  # warm-up request to collect cookies
        content = s.get(url).content
        soup = BeautifulSoup(content, "lxml")
        textlists = soup.findAll(attrs={'class': 'textlist'})
        if len(textlists) > 0:
            links = textlists[0].findAll('a')
            # Fragile but kept as-is: slices each <a> tag's repr.
            L[c] = [str(i)[str(i).find('href')+6:str(i).find('title')-2] for i in links]
            M[c] = [str(i)[str(i).find('title')+7:str(i).find('>')-1] for i in links]
            # NOTE(review): str(i).find('<') is 0 for a tag repr, so this
            # is effectively str(i)[-8:-5] -- presumably an airport code
            # in the link text; confirm against the live markup.
            N[c] = [str(i)[str(i).find('<')-8:str(i).find('<')-5] for i in links]
            if c in F:
                F.remove(c)  # a retried country succeeded
            print('Success ' + url)  # portable across Python 2/3
        else:
            if c not in F:  # avoid duplicate entries across retries
                F.append(c)
            print('Fail ' + url)
In [88]:
F
Out[88]:
Save links
In [89]:
import json
# Persist the scraped tables. The original used the Python 2-only
# file() builtin and never closed the handles; open() in a with-block
# closes them deterministically and works on both Python 2 and 3.
for name, table in (('L', L), ('M', M), ('N', N)):
    with open('../json/' + name + '.json', 'w') as f:
        json.dump(table, f)
Map countries to ISO 3166-1 alpha-2 codes
In [90]:
iso2=pd.read_html('https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2')
In [91]:
# Build I2: country name -> ISO2 code from the scraped Wikipedia table.
# Based on the indexing below, i[1][0] carries the code and i[1][1] the
# name -- TODO confirm against the current page layout.
I2 = {}
# .iteritems() was the Python 2-era pandas spelling and was removed in
# pandas 2.0; .items() behaves identically on every pandas version.
for i in iso2[2].T.set_index(0).items():
    country = i[1][1]
    code = i[1][0]
    # read_html parses the literal code 'NA' (presumably Namibia) as
    # NaN; restore the string form.
    if str(code) == 'nan':
        code = u'NA'
    I2[country] = code
In [92]:
# Local country-name-conversion sheet. After set_index('Name') the
# lookups below treat country names as columns and row 'ISO2' as the
# code row -- presumably the sheet is laid out that way; verify.
cnc_path='../../universal/countries/'
cnc=pd.read_excel(cnc_path+'cnc.xlsx').set_index('Name')
In [93]:
# Resolve each scraped country to an ISO2 code: try the Wikipedia table
# first, then the local cnc sheet; anything still unresolved is
# collected in err for manual fixing below.
I3 = {}
err = []
for name in L:
    if name in I2:
        code = I2[name]
    elif name in cnc.columns:
        code = cnc.loc['ISO2'][name]
    else:
        err.append(name)
        continue
    I3[name] = code
In [94]:
err
Out[94]:
Manual fix conflicts
In [95]:
# Hand-curated name -> ISO2 fixes for countries whose airportia
# spelling matches neither the Wikipedia table nor the cnc sheet.
E2={
'Saint Helena':'SH',
'Wallis And Futuna':'WF',
'Palestinian Territory':'PS',
'Saint Kitts And Nevis':'KN',
'South Korea':'KR',
'Laos':'LA',
'Antigua And Barbuda':'AG',
'Russia':'RU',
'Caribbean Netherlands':'BQ',
'Syria':'SY',
'Brunei':'BN',
'Moldova':'MD',
"Cote D'ivoire (Ivory Coast)":'CI',
'U.S. Virgin Islands':'VI',
'Reunion':'RE',
'Curacao':'CW',
'Micronesia':'FM',
'Bosnia And Herzegovina':'BA',
# NOTE(review): the \xc3.. escapes below are UTF-8 bytes inside Python 2
# str literals ("Sao Tome", "Peru"); left byte-identical so the keys
# keep matching the scraped byte strings.
'S\xc3\xa3o Tom\xc3\xa9 and Principe':'ST',
'Macedonia':'MK',
'North Korea':'KP',
'Per\xc3\xba':'PE',
'Burma':'MM',
'Trinidad And Tobago':'TT'
}
In [100]:
# Merge the manual fixes into I3 (identical effect to the original
# key-by-key copy loop).
I3.update(E2)
In [118]:
I3['Kosovo']='XK'
In [104]:
len(I3)==len(A)
Out[104]:
In [119]:
file('../json/I3.json','w').write(json.dumps(I3))
Save
In [109]:
# Dump the mapping as a one-row table (row label 'ISO2', one column
# per country) for manual cleanup in a spreadsheet.
pd.DataFrame(I3,index=['ISO2']).to_csv(cnc_path+'cnc.csv')
# then manually edit the CSV to create a legible version
In [112]:
# Reload the manually edited table. NOTE(review): assumes the edited
# CSV gained an 'index' column -- the raw dump above does not write
# one, so this relies on the manual edit step; verify.
cnc=pd.read_csv(cnc_path+'cnc.csv').set_index('index')
cnc
Out[112]:
Create folder structure
In [24]:
import os.path
In [120]:
# Create ../countries/<iso2>/{code,d3,json,map} for every mapped
# country, skipping anything that already exists.
for country in I3:
    root = '../countries/' + I3[country].lower()
    if not os.path.exists(root):
        os.makedirs(root)
    for sub in ['code', 'd3', 'json', 'map']:
        path = root + '/' + sub
        if not os.path.exists(path):
            os.makedirs(path)