In [31]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt, requests, urllib2
from bs4 import BeautifulSoup
%matplotlib inline
Import CNC
In [107]:
# Folder holding the shared country-codes workbook (relative to this notebook).
cnc_path='../../universal/countries/'
# Country attribute table: the 'Name' column becomes the row index and each
# remaining column is one country -- the later cells iterate cnc.columns as
# country names and read cnc.loc['ISO2'], so rows are presumably attributes
# (ISO2, etc.) and columns are countries. TODO confirm workbook layout.
cnc=pd.read_excel(cnc_path+'cnc.xlsx').set_index('Name')
In [101]:
cnc
Out[101]:
Capture airport lists
In [65]:
# L: country name -> list of airport page hrefs scraped from airportia.
L={}
# F: country names whose auto-generated URL slug failed (fixed up later via lnc).
F=[]
# Base URL of the site being scraped; country slug is appended to it.
baseurl='https://www.airportia.com/'
In [85]:
for k in range(len(cnc.columns)):
c=cnc.columns[k]
if c not in L and F:
#capture token
url=baseurl+c.lower().replace(' ','-')
s = requests.Session()
cookiesopen = s.get(url)
cookies=str(s.cookies)
fcookies=[[k[:k.find('=')],k[k.find('=')+1:k.find(' for ')]] for k in cookies[cookies.find('Cookie '):].split('Cookie ')[1:]]
#push token
opener = urllib2.build_opener()
for k in fcookies:
opener.addheaders.append(('Cookie', k[0]+'='+k[1]))
#read html
content=s.get(url).content
soup = BeautifulSoup(content, "lxml")
if len(soup.findAll(attrs={'class':'textlist'}))>0:
links=soup.findAll(attrs={'class':'textlist'})[0].findAll('a')
L[c]=[str(i)[str(i).find('href')+6:str(i).find('title')-2] for i in links]
print 'Success',url
else:
F.append(c)
print 'Fail',url
Fix failures
In [111]:
# Manual URL-slug overrides for countries whose auto-generated slug
# (lowercase, spaces -> dashes) returned no airport list on airportia.
# NOTE(review): entries whose value is left identical to the key (e.g.
# "Holy See", "Tokelau", "Zanzibar") still contain spaces/capitals, so they
# will fail again and end up in F2 -- presumably intentional "known
# unfixable" placeholders; confirm before relying on them.
lnc={
"Brunei Darussalam":"brunei",
"Cent African Rep":"central-african-republic",
"Cote d'Ivoire":"ivory-coast",
"People's Republic of Korea":"north-korea",
"Dem. Rep. of Congo":"congo",
"Guinea-Bissau":"guinea_bissau",
"Holy See":"Holy See",
"Lao People's Dem. Rep.":"laos",
"Libyan Arab Jamahiriya":"libya",
"Liechtenstein":"Liechtenstein",
"Macao, China":"Macao, China",
"Micronesia (Federated States of)":"micronesia",
"Myanmar (Burma)":"burma",
"Netherlands Antilles":"Netherlands Antilles",
"Palestinian Territories":"palestinian-territory",
"Peru":u"perú",
"Rep. of Korea":"south-korea",
"Rep. of Moldova":"moldova",
"Russian Federation":"russia",
"San Marino":"San Marino",
"Syrian Arab Republic":"syria",
"FYR of Macedonia":"macedonia",
"Timor-Leste":"timor_leste",
"Tokelau":"Tokelau",
"Zanzibar":"Zanzibar",
"Svalbard and Jan Mayen":"Svalbard and Jan Mayen",
"US Virgin Islands":"US Virgin Islands"
}
In [69]:
F2=[]
In [79]:
for k in range(len(cnc.columns)):
c=cnc.columns[k]
if c not in L and F2:
#capture token
url=baseurl+lnc[c]
s = requests.Session()
cookiesopen = s.get(url)
cookies=str(s.cookies)
fcookies=[[k[:k.find('=')],k[k.find('=')+1:k.find(' for ')]] for k in cookies[cookies.find('Cookie '):].split('Cookie ')[1:]]
#push token
opener = urllib2.build_opener()
for k in fcookies:
opener.addheaders.append(('Cookie', k[0]+'='+k[1]))
#read html
content=s.get(url).content
soup = BeautifulSoup(content, "lxml")
if len(soup.findAll(attrs={'class':'textlist'}))>0:
links=soup.findAll(attrs={'class':'textlist'})[0].findAll('a')
L[c]=[str(i)[str(i).find('href')+6:str(i).find('title')-2] for i in links]
print 'Success',url
else:
F2.append(c)
print 'Fail',url
In [89]:
# Countries with no working airportia slug at all, even after the lnc
# fix-ups ("absolute failures"); these get NaN in the Airportia column below.
F2
Out[89]:
Update CNC
In [113]:
# Build the Airportia slug for every country, in cnc.columns order:
#   - absolute failure (F2)      -> NaN
#   - first-pass failure (F)     -> manual override from lnc
#   - otherwise                  -> auto-generated slug
g = []
for country in cnc.columns:
    if country in F2:
        g.append(np.nan)
    elif country in F:
        g.append(lnc[country])
    else:
        g.append(country.lower().replace(' ', '-'))
In [114]:
# Transpose so countries become rows, attach the slug list as a new
# 'Airportia' row attribute, then transpose back to the original layout.
dnc=cnc.T
dnc['Airportia']=g
cnc=dnc.T
In [116]:
cnc
Out[116]:
In [118]:
cnc.to_excel(cnc_path+'cnc1.xlsx')
Save links
In [120]:
import json
# Write the scraped airport-link lists to disk. Use a context manager so the
# handle is closed and flushed -- the original `file(...).write(...)` left
# the handle open, and the `file()` builtin is deprecated (removed in
# Python 3). json.dump streams straight to the file.
with open('../json/L.json', 'w') as outfile:
    json.dump(L, outfile)
Create folder structure
In [125]:
import os.path
In [131]:
# Create a per-country folder skeleton (code/d3/json/map) keyed by ISO2 code.
for iso2 in cnc.loc['ISO2']:
    # Skip countries without an ISO2 code. BUG FIX: the original tested
    # `str(i).lower()!='nan'`, a fragile string round-trip of float NaN;
    # pd.notnull checks missingness directly.
    if pd.notnull(iso2):
        directory = '../countries/' + iso2.lower()
        for sub in ['code', 'd3', 'json', 'map']:
            path = directory + '/' + sub
            # makedirs creates intermediate directories, so the separate
            # mkdir of `directory` in the original was redundant.
            if not os.path.exists(path):
                os.makedirs(path)