In [1]:
import requests, pandas as pd, numpy as np
from requests import session
from bs4 import BeautifulSoup
In [2]:
url = 'https://www.tomtom.com/en_gb/traffic-index/ranking/'

# Fetch the TomTom traffic-index ranking page and parse the ranking table.
response = requests.get(url)
soup = BeautifulSoup(response.content)
# The first <table> element is the ranking table. It is already a complete,
# well-formed table, so pass it to read_html directly — the original wrapped
# it in an extra '<table>' prefix and a malformed '/<table>' suffix.
table = soup.findAll('table')[0]
df = pd.read_html(str(table))[0]
# Links appear in pairs per row (every other <a> is kept) — presumably each
# row carries two identical city links; TODO confirm against the live page.
links = table.findAll('a')
df['a'] = ['https://www.tomtom.com' + i['href'] for i in links[::2]]
In [4]:
maxs = {}

# Visit each city's detail page once and collect its peak-congestion stats.
for city_url in df['a'].values:
    if city_url in maxs:
        continue  # already scraped — makes re-running this cell safe
    detail = requests.get(city_url)
    detail_soup = BeautifulSoup(detail.content)
    peak_divs = detail_soup.findAll('div', {"class": "PeakCongestion__congestion"})
    wasted = detail_soup.findAll('span', {"class": "TimeWastedInPeaks__hours"})[0].text
    # Each div's text presumably ends in '%' — the trailing character is
    # dropped before parsing as int; TODO confirm against the page markup.
    maxs[city_url] = [int(d.text[:-1]) for d in peak_divs] + [wasted]
    print(city_url)
In [6]:
# One row per city URL: [morning peak, evening peak, time-wasted text].
maxdf = pd.DataFrame(maxs).T.set_axis(['morning', 'eve', 'waste'], axis=1)
In [7]:
dc=df.set_index('a').join(maxdf).drop(['Unnamed: 5','World Rank'],axis=1).reset_index()
In [8]:
# Numeric congestion level: the text before the first '%'.
dc['Congestion Level2'] = dc['Congestion Level'].str.split('%', n=1).str[0]
# Numeric time-wasted figure: the first whitespace-separated token.
dc['waste2'] = dc['waste'].str.split(' ', n=1).str[0]
In [44]:
dc.to_excel('dc.xlsx')
In [45]:
dc.to_csv('dc.csv')
In [46]:
!pip install pygeocoder
In [47]:
import os

from pygeocoder import Geocoder

# SECURITY: never hardcode API keys in a notebook. A previous revision of
# this cell embedded a live Google Maps key, which must be treated as leaked
# and revoked. Supply the key via the environment instead; failing fast with
# a KeyError here is preferable to silently geocoding with a bad key.
apikey = os.environ['GOOGLE_MAPS_API_KEY']
In [80]:
# Reuse previously geocoded coordinates from dcg.csv (written further below)
# so re-runs don't burn geocoding API quota. Start from an empty cache when
# the file is missing, empty, or lacks the expected columns — a bare
# `except:` here would also have swallowed unrelated bugs.
try:
    geos = pd.read_csv('dcg.csv')
    geos.index = geos['City'] + ', ' + geos['Country']
    geos = geos[['lat', 'lon']].T.to_dict()
    # dict of "City, Country" -> (lat, lon)
    geos = {k: tuple(v.values()) for k, v in geos.items()}
except (FileNotFoundError, KeyError, pd.errors.EmptyDataError):
    geos = {}
In [81]:
# Geocode every "City, Country" that is not already in the cache.
for row in dc.index[:]:
    to_geo = dc.loc[row]['City'] + ', ' + dc.loc[row]['Country']
    if to_geo in geos:
        continue
    print(to_geo)
    geos[to_geo] = Geocoder(apikey).geocode(to_geo).coordinates
In [84]:
# Attach coordinates by looking up each row's "City, Country" key, instead of
# zipping `geos.values()` against the frame positionally: the cached dcg.csv
# may list cities in a different order (or contain extra entries), which
# would silently misalign — or crash — the lat/lon columns.
keys = dc['City'] + ', ' + dc['Country']
dc['lat'] = [geos[k][0] for k in keys]
dc['lon'] = [geos[k][1] for k in keys]

# Persist the geocoded table; this is the cache read back at the top.
dc.to_csv('dcg.csv')
In [9]:
dc=pd.read_csv('dcg.csv')
Population data source: https://population.un.org/wup/Download/ → "Urban Agglomerations" dataset.
In [10]:
# City population database (worldcities.csv); the cells below use its
# 'city', 'city_ascii', 'admin_name', 'country' and 'population' columns.
pop2 = pd.read_csv('worldcities.csv')

# Show only the head — displaying the bare frame dumps every row into the
# notebook output.
pop2.head()
Out[11]:
In [12]:
# TomTom city name -> worldcities.csv name(s). A '+' joins several cities
# whose populations should be summed for one TomTom metro area.
_CITY_FIXES = {
    'Moscow region (oblast)': 'Moscow',
    'Rostov-on-Don': 'Rostov-na-Donu',
    'Gdansk, Gdynia and Sopot': 'Gdansk+Gdynia',
    'Nizhny Novgorod': 'Nizhniy Novgorod',
    'Kazan': "Kazan'",
    'Birmingham-Wolverhampton': 'Birmingham+Wolverhampton',
    'Frankfurt am Main': 'Frankfurt',
    'Reggio Calabria': 'Reggio di Calabria',
    'Leeds-Bradford': 'Leeds+Bradford',
    'Newcastle-Sunderland': 'Newcastle+Sunderland',
    'Freiburg': 'Freiburg im Breisgau',
    'Ruhr region west': 'Duisburg+Dusseldorf+Bonn+Cologne',
    'Ruhr region east': 'Essen+Dortmund+Wuppertal',
    'Cape Coral-Fort Myers': 'Cape Coral+Fort Myers',
    'Oxnard-Thousand Oaks-Ventura': 'Oxnard+Thousand Oaks',
    'Dallas-Fort Worth': 'Dallas+Fort Worth',
    'Katowice urban area': 'Katowice',
    'Gothenburg': 'Goteborg',
    'Odessa': 'Odesa',
    'Kitchener-Waterloo': 'Kitchener',
    'Omaha-Council Bluffs': 'Omaha+Council Bluffs',
    'Greensboro-High Point': 'Greensboro+High Point',
    'Den Bosch': "'s-Hertogenbosch",
    'Hull': 'Kingston upon Hull',
    'Swansea': 'Abertawe',
    'Seville': 'Sevilla',
    'Ghent': 'Gent',
}


def ccc(c, country):
    """Translate a TomTom city name to its worldcities.csv spelling.

    Replaces the original 28-branch if-chain with a lookup table. Returns
    the input unchanged when no correction is known. 'Newcastle' is the one
    country-dependent case: only the UK city maps to 'Newcastle upon Tyne'.

    :param c: city name as scraped from TomTom
    :param country: country name (already normalised by cnc)
    :return: corrected city name, possibly several joined with '+'
    """
    if c == 'Newcastle':
        return 'Newcastle upon Tyne' if country == 'United Kingdom' else 'Newcastle'
    return _CITY_FIXES.get(c, c)
In [13]:
def cnc(c):
    """Normalise a TomTom country name to the worldcities.csv spelling."""
    # Only one known mismatch between the two datasets.
    return {'United States of America': 'United States'}.get(c, c)
In [14]:
pops = {}
missing = []
# For each TomTom city, sum populations of its constituent worldcities.csv
# entries ('+'-joined names), trying 'city', then 'city_ascii', then
# 'admin_name' as the match column.
for i in dc.index:
    city = dc.loc[i]['City']
    country0 = dc.loc[i]['Country']
    country = cnc(country0)
    if country not in pop2['country'].unique():
        # Country spelling mismatch between datasets — flag for cnc().
        print(country)
    pop2a = pop2[pop2['country'] == country]
    index = city + ', ' + country0
    for c in ccc(city, country).split('+'):
        c = ccc(c, country)
        if c in ['San Sebastian']:
            # Known ambiguous match; resolved manually further below.
            print(city, c)
            # BUGFIX: key built with country0 (like `index`), not the
            # normalised `country`, so the manual-fix dict keys match.
            missing.append(index)
        elif c in pop2a['city'].values:
            if index not in pops:
                pops[index] = 0
            pops[index] += pop2a.drop_duplicates('city').set_index('city').loc[c]['population']
        # BUGFIX: membership must be tested on the country-filtered pop2a
        # (not the full pop2) — otherwise a city that exists only in another
        # country passes the check and the .loc[c] below raises KeyError.
        elif c in pop2a['city_ascii'].values:
            if index not in pops:
                pops[index] = 0
            pops[index] += pop2a.drop_duplicates('city_ascii').set_index('city_ascii').loc[c]['population']
        elif c in pop2a['admin_name'].values:
            if index not in pops:
                pops[index] = 0
            pops[index] += pop2a.drop_duplicates('admin_name').set_index('admin_name').loc[c]['population']
        else:
            # No match anywhere — try population.city in the next cell.
            print(city, c, index)
            missing.append(index)
In [15]:
fixed = {}

# Fallback for unmatched cities: scrape population.city, whose URLs are
# of the form http://population.city/<country>/<city>/.
for entry in missing:
    parts = entry.split(',')
    url3 = ('http://population.city/' + parts[1].lower().strip() + '/'
            + parts[0].lower().strip() + '/')
    page = requests.get(url3)
    page_soup = BeautifulSoup(page.content)
    em_tags = page_soup.findAll('em')
    if em_tags:
        # The first <em> presumably holds the figure with space-separated
        # thousands and one trailing character that is stripped — TODO
        # confirm against the page markup.
        fixed[entry] = float(em_tags[0].text[:-1].replace(' ', ''))
        print('OK', entry)
    else:
        print('ERROR', entry)
In [17]:
# Merge in the populations recovered from population.city.
pops.update(fixed)

# Manually sourced populations (Wikipedia) for cities that neither
# worldcities.csv nor population.city resolved.
wiki_populations = {
    'Palma de Mallorca, Spain': 393256.0,
    'Las Palmas, Spain': 384315.0,
    'Reggio Emilia, Italy': 172326.0,
    'Padua, Italy': 214000.0,
    'Almere, Netherlands': 207904.0,
    'San Sebastian, Spain': 186095.0,
    # Unflagged entries corrected by hand:
    'Preston, United Kingdom': 141251.0,
    'Birmingham-Wolverhampton, United Kingdom': 1157579.0 + 259376.0,
    'Mersin, Turkey': 1038940.0,
}
pops.update(wiki_populations)
Manual overrides
In [20]:
# Override Delhi with the agglomeration-level figure.
pops['New Delhi, India'] = 20268785.0

# One-column frame: index = "City, Country", column = 'pop'.
dp = pd.Series(pops, name='pop').to_frame()
In [22]:
# Join key matching the index of dp: "City, Country".
dc['cc'] = dc['City'] + ', ' + dc['Country']

dcp = dc.set_index('cc').join(dp).reset_index()
len(dcp)
Out[23]:
In [24]:
# Sanity check: rows whose population is still unresolved. Use .isna()
# rather than np.isnan — the latter raises TypeError on object-dtype
# columns, .isna() handles any dtype.
dcp[dcp['pop'].isna()]

len(dcp)

# Drop the individual Ruhr cities: they are already counted inside the
# combined 'Ruhr region west' row, so keeping them would double-count.
dcp = dcp[~dcp['City'].isin(['Duisburg', 'Dusseldorf', 'Bonn', 'Cologne'])]
len(dcp)
Out[26]:
In [27]:
# EU member states used to compute an EU-only congestion rank.
EU_COUNTRIES = ['Austria', 'Belgium', 'Bulgaria', 'Croatia', 'Cyprus', 'Czechia',
                'Denmark', 'Estonia', 'Finland', 'France', 'Germany', 'Greece',
                'Hungary', 'Ireland', 'Italy', 'Latvia', 'Lithuania', 'Luxembourg',
                'Malta', 'Netherlands', 'Poland', 'Portugal', 'Romania', 'Slovakia',
                'Slovenia', 'Spain', 'Sweden']

# 'eu' is the 1-based rank among EU cities, ordered by the TomTom ranking.
dcr = dcp[dcp['Country'].isin(EU_COUNTRIES)].sort_values('Rank by filter').reset_index()
dcr['eu'] = dcr.index + 1

# Attach the EU rank back onto the full table; non-EU cities get 0.
dcw = dcp.join(dcr.set_index('City')[['eu']], on='City').fillna(0)
In [28]:
# Convert to a dict keyed by "City, Country" so single cells can be patched.
dcx = dcw.set_index('cc').T.to_dict()

# Hand-corrected coordinates where the geocoder picked the wrong place.
coord_fixes = {
    'Washington, United States of America': (38.930378, -77.057839),
    'Ruhr region west, Germany': (51.347795, 6.699458),
    'Ruhr region east, Germany': (51.484191, 7.457383),
    'Quebec, Canada': (46.799437, -71.264797),
}
for key, (lat, lon) in coord_fixes.items():
    dcx[key]['lat'] = lat
    dcx[key]['lon'] = lon

# Final export consumed downstream.
pd.DataFrame(dcx).T.to_csv('dcx.csv')