In [1]:
import pandas as pd
import numpy as np
import io
import requests
In [2]:
year = 2019  # season to extract; drives the Date filter below and the output file name
In [15]:
# NOTE(review): this URL embeds basic-auth credentials ('r:o@') in clear text.
# Move them to environment variables or getpass() before sharing this notebook —
# anyone with the .ipynb (or its git history) can read them.
url = 'https://r:o@url_kuzhet'
In [4]:
# Download the raw race list and keep only the columns / rows we need.
response = requests.get(url)
response.raise_for_status()  # fail fast instead of silently parsing an error page as CSV
u = io.StringIO(response.content.decode('utf-8'))
dfo = pd.read_csv(u,
                  index_col=False,
                  names=["Date", "Format", "Name", "Location", "Place", "Kpr", "Prize", "Country", "Start-Time", "Continent", "City-Time", "Latitude", "Longitude", "Address1", "Hashtag", "Adress2", "Slots", "Pro", "C", "D", "URL"],
                  dtype={'Latitude': np.float64, 'Longitude': np.float64},
                  encoding='utf8')
# Keep the columns used downstream.
df = dfo[['Date', 'Format', 'Name', 'Location', 'Country', 'Start-Time', 'Continent', 'City-Time', 'Latitude', 'Longitude', 'Hashtag']]
# Keep only the races of the selected year (Date is a plain string column).
df = df[df['Date'].str.contains(str(year))]
In [5]:
# Quick sanity peek at the filtered frame.
df.head(n=3)
Out[5]:
In [6]:
import urllib.request
import time
def ping(url, delay=5):
    """Return True if `url` can be fetched, False otherwise.

    Sleeps `delay` seconds before each request so repeated calls do not
    hammer the official web site (default 5 s, the original behaviour).
    Any exception from urlopen (bad URL, HTTP error, network failure)
    is treated as "not reachable".
    """
    time.sleep(delay)
    try:
        urllib.request.urlopen(url).read()
        return True
    except Exception:
        print("not fetched")  # typo fix: message previously read "not fecthed"
        return False
In [7]:
# The official web site has no separate page for the 70.3 World Championship
# women's-results entry, so drop that row up front.
df = df.loc[df['Name'] != "IRONMAN 70.3 World Championship - Women's Results"]
In [8]:
from pytz import timezone
from datetime import datetime, timedelta
# Build a tz-aware 'GMT' timestamp for each race from its local date,
# start time and IANA zone name ("Continent/City-Time").
ContinentCityZone = df.Continent+'/'+df['City-Time']
df['TzInfo'] = [timezone(x) for x in ContinentCityZone]
df['DateStartTime'] = df.Date +' '+df['Start-Time']
df['Timestamp'] = [pd.Timestamp(x) for x in df.DateStartTime]
for index, row in df.iterrows():
    # BUG FIX: DataFrame.set_value was deprecated in pandas 0.21 and removed
    # in 1.0; .at is the supported scalar setter.
    df.at[index, 'GMT'] = row.Timestamp.tz_localize(row.TzInfo)
# Drop the intermediate helper columns.
df = df.drop(['TzInfo', 'DateStartTime', 'Timestamp'], axis=1)
In [9]:
# Latest three European races by GMT start time.
df.loc[df['Continent'] == 'Europe'].sort_values('GMT').tail(3)
Out[9]:
In [10]:
# The GMT column was only needed for the preview above.
df = df.drop(columns=['GMT'])
In [ ]:
In [11]:
# Build the official ironman.com event URL for every race, e.g.
#   "http://www.ironman.com/triathlon/events/americas/ironman-70.3/pucon.aspx"
#   "http://eu.ironman.com/triathlon/events/emea/ironman/south-africa.aspx"
df['Link'] = 'N/A'
prefix = 'http://www.ironman.com/triathlon/events/'
suffix = '.aspx'

# Bounding box routing Middle-East events (e.g. Dubai, Kazakhstan) to the
# 'emea' site section even though their Continent column says Asia.
# Hoisted out of the loop: loop-invariant.
emea_lat_min, emea_lat_max = 25.0, 55.0
emea_long_min, emea_long_max = 12.0, 75.0

# Continent column -> ironman.com site section (same mapping as the
# original elif chain).
continent_to_section = {
    'Africa': 'emea',
    'America': 'americas',
    'Atlantic': 'emea',
    'Asia': 'asiapac',
    'Australia': 'asiapac',
    'Europe': 'emea',
    'Pacific': 'asiapac',
}

for index, row in df.iterrows():
    location = row.Location  # local copy; mutating the iterrows row is a no-op on df

    # Geography first: the lat/long box wins over the Continent column.
    if (emea_lat_min <= row.Latitude <= emea_lat_max
            and emea_long_min <= row.Longitude <= emea_long_max):
        continent = 'emea'
    elif row.Country == 'USA':
        # Needed for Hawaii, whose Continent column says 'Pacific'.
        continent = 'americas'
    else:
        continent = continent_to_section.get(row.Continent, 'N/A')

    # The race format decides the middle path segment.
    if row.Format == 'half':
        middle = '/ironman-70.3/'
        # The 70.3 World Championship men's and women's races share one page.
        if location in ('70.3-world-championship-mens-race',
                        '70.3-world-championship-womens-race'):
            location = '70.3-world-championship'
            continent = 'americas'
    elif row.Format == 'iron':
        middle = '/ironman/'
    else:
        # BUG FIX: 'middle' was previously left unset here, so the URL was
        # built with the PREVIOUS row's value (NameError on the first row).
        middle = '/unknown/'
        print('!!! unknown format')

    url = prefix + continent + middle + location + suffix
    # BUG FIX: the sentinel is 'N/A'; the old test compared against
    # '/NA!!!/' and could never fire.
    if continent == 'N/A':
        print(url)

    # Keep this flag False as often as possible in order not to burden the
    # official web-site.
    do_we_want_to_ping_url = False
    if do_we_want_to_ping_url:
        if ping(url):
            print("%d ok for %s (%s)" % (index, row.Name, row.Date))
        else:
            print("%d KO for %s !!" % (index, url))

    # BUG FIX: DataFrame.set_value was removed in pandas 1.0; use .at.
    df.at[index, 'Link'] = url
In [12]:
# Verify the new Link column looks right.
df.head(n=5)
Out[12]:
In [13]:
# Persist the enriched race table for downstream notebooks.
df.to_csv(path_or_buf='races-{}.csv'.format(year), encoding='utf8', index=False)
In [ ]:
In [14]:
# NOTE(review): this cell is a byte-for-byte duplicate of the link-building
# cell In [11] above. Consider extracting an add_links(df) helper and
# deleting one copy; kept here (with the same bug fixes) so the notebook
# still runs top-to-bottom unchanged.
df['Link'] = 'N/A'
prefix = 'http://www.ironman.com/triathlon/events/'
suffix = '.aspx'

# Bounding box routing Middle-East events (e.g. Dubai, Kazakhstan) to 'emea'.
emea_lat_min, emea_lat_max = 25.0, 55.0
emea_long_min, emea_long_max = 12.0, 75.0

# Continent column -> ironman.com site section (same mapping as the
# original elif chain).
continent_to_section = {
    'Africa': 'emea',
    'America': 'americas',
    'Atlantic': 'emea',
    'Asia': 'asiapac',
    'Australia': 'asiapac',
    'Europe': 'emea',
    'Pacific': 'asiapac',
}

for index, row in df.iterrows():
    location = row.Location  # local copy; mutating the iterrows row is a no-op on df

    # Geography first: the lat/long box wins over the Continent column.
    if (emea_lat_min <= row.Latitude <= emea_lat_max
            and emea_long_min <= row.Longitude <= emea_long_max):
        continent = 'emea'
    elif row.Country == 'USA':
        # Needed for Hawaii, whose Continent column says 'Pacific'.
        continent = 'americas'
    else:
        continent = continent_to_section.get(row.Continent, 'N/A')

    # The race format decides the middle path segment.
    if row.Format == 'half':
        middle = '/ironman-70.3/'
        # The 70.3 World Championship men's and women's races share one page.
        if location in ('70.3-world-championship-mens-race',
                        '70.3-world-championship-womens-race'):
            location = '70.3-world-championship'
            continent = 'americas'
    elif row.Format == 'iron':
        middle = '/ironman/'
    else:
        # BUG FIX: 'middle' was previously left unset here, so the URL was
        # built with the PREVIOUS row's value (NameError on the first row).
        middle = '/unknown/'
        print('!!! unknown format')

    url = prefix + continent + middle + location + suffix
    # BUG FIX: the sentinel is 'N/A'; the old test compared against
    # '/NA!!!/' and could never fire.
    if continent == 'N/A':
        print(url)

    # Keep this flag False as often as possible in order not to burden the
    # official web-site.
    do_we_want_to_ping_url = False
    if do_we_want_to_ping_url:
        if ping(url):
            print("%d ok for %s (%s)" % (index, row.Name, row.Date))
        else:
            print("%d KO for %s !!" % (index, url))

    # BUG FIX: DataFrame.set_value was removed in pandas 1.0; use .at.
    df.at[index, 'Link'] = url
In [ ]: