In [1]:
import pandas as pd
import numpy as np
import io
import requests

In [2]:
# Season to extract: used to filter the calendar rows by date and to name the output CSV.
year = 2019

In [15]:
# Address of the race-calendar CSV that is downloaded with requests.get().
# NOTE(review): the address looks like it embeds user:password credentials -
# confirm, and if so load them from the environment instead of committing them.
url = 'https://r:o@url_kuzhet'

In [4]:
# Download the race calendar and load it into `dfo` (column names are supplied
# here through `names`).
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as CSV.
response.raise_for_status()
s = response.content
u = io.StringIO(s.decode('utf-8'))

dfo = pd.read_csv(u,
                  index_col=False,
                  names=["Date", "Format", "Name", "Location", "Place", "Kpr", "Prize", "Country", "Start-Time", "Continent", "City-Time", "Latitude", "Longitude", "Address1", "Hashtag", "Adress2", "Slots", "Pro", "C", "D", "URL"],
                  dtype={'Latitude': np.float64, 'Longitude': np.float64},
                  encoding='utf8')

# Keep only the columns used further down, restricted to races dated in `year`.
df = dfo[['Date', 'Format', 'Name', 'Location', 'Country', 'Start-Time', 'Continent', 'City-Time', 'Latitude', 'Longitude', 'Hashtag']]
# na=False: a row without a date is dropped instead of producing a NaN mask
# that boolean indexing rejects.
# .copy(): detach the result from `dfo` so that the column assignments made
# later on `df` are not flagged as writes to a slice.
df = df[df['Date'].str.contains(str(year), na=False)].copy()

In [5]:
# Preview the first three rows of the filtered calendar.
df.head(3)


Out[5]:
Date Format Name Location Country Start-Time Continent City-Time Latitude Longitude Hashtag
752 2019-01-13 half 70.3 Pucon pucon CHL 08:00:00 America Santiago -39.270416 -71.978283 #pucon703
753 2019-01-27 half 70.3 South Africa south-africa RSA 07:00:00 Africa Johannesburg -33.029158 27.854587 #IM703SA
754 2019-02-01 half 70.3 Dubai dubai ARE 07:00:00 Asia Dubai 25.204849 55.270783 #IM703Dubai

In [6]:
import urllib.request
import time

def ping(url, delay=5, timeout=30):
    """Return True when `url` can be fetched, False otherwise.

    url     -- address to request.
    delay   -- seconds to sleep before the request, so that repeated calls
               do not hammer the target site (default 5, as before).
    timeout -- seconds after which a stalled request is abandoned and
               reported as a failure instead of hanging forever.

    Any exception raised while opening or reading the page is reported by
    printing a short message and returning False.
    """
    time.sleep(delay)
    try:
        # The context manager closes the connection; the body is read only to
        # make sure the page is really served, its content is not used.
        with urllib.request.urlopen(url, timeout=timeout) as response:
            response.read()
        return True
    except Exception:
        print("not fecthed")
        return False

In [7]:
# This entry is reported to be a problem with the official web site,
# so it is removed from the calendar.
df = df.loc[df['Name'] != "IRONMAN 70.3 World Championship - Women's Results"]

In [8]:
from pytz import timezone
from datetime import datetime, timedelta

# Attach to every race its start time localised to the race's own timezone.
# pytz zone names are assembled as "<Continent>/<City-Time>", e.g. Europe/Lisbon.
ContinentCityZone = df['Continent'] + '/' + df['City-Time']
DateStartTime = df['Date'] + ' ' + df['Start-Time']

# The whole column is assigned at once: this replaces the per-row
# DataFrame.set_value calls (deprecated, removed in pandas 1.0) and avoids the
# scratch columns that had to be dropped afterwards.
# dtype=object keeps each Timestamp with its own UTC offset.
df['GMT'] = pd.Series(
    [pd.Timestamp(start).tz_localize(timezone(zone))
     for start, zone in zip(DateStartTime, ContinentCityZone)],
    index=df.index,
    dtype=object,
)


/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:13: FutureWarning: set_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead
  del sys.path[0]

In [9]:
# Last three European races when ordered by the localised start time.
df.loc[df['Continent'] == 'Europe'].sort_values('GMT').iloc[-3:]


Out[9]:
Date Format Name Location Country Start-Time Continent City-Time Latitude Longitude Hashtag GMT
883 2019-09-29 half 70.3 Portugal cascais PRT 07:30:00 Europe Lisbon 38.696661 -9.419599 #IMI703Portugal 2019-09-29 07:30:00+01:00
885 2019-10-06 iron Barcelona barcelona ESP 08:08:00 Europe Madrid 41.610325 2.646578 #IMBarcelona 2019-10-06 08:08:00+02:00
898 2019-10-28 half 70.3 Turkey turkey TUR 07:25:00 Europe Istanbul 36.836766 31.108628 #IM703Turkey 2019-10-28 07:25:00+03:00

In [10]:
# Remove the GMT helper column again.
df = df.drop(columns=['GMT'])

In [ ]:


In [11]:
#"http://www.ironman.com/triathlon/events/americas/ironman-70.3/pucon.aspx"
#"http://eu.ironman.com/triathlon/events/emea/ironman/south-africa.aspx"


# Build the official event-page URL of every race and store it in df['Link'].

prefix = 'http://www.ironman.com/triathlon/events/'
suffix = '.aspx'

# Bounding box (degrees) whose races are filed under 'emea' whatever their
# timezone continent is; needed for places such as Dubai and Kazakhstan.
emea_lat_min = 25.0
emea_lat_max = 55.0
emea_long_min = 12.0
emea_long_max = 75.0

# Site region used for each timezone continent.
region_by_continent = {
    'Africa': 'emea',
    'America': 'americas',
    'Atlantic': 'emea',
    'Asia': 'asiapac',
    'Australia': 'asiapac',
    'Europe': 'emea',
    'Pacific': 'asiapac',
}

# put next flag to False as often as possible in order not to burden official web-site
do_we_want_to_ping_url = False

df['Link'] = 'N/A'

for index, row in df.iterrows():
    location = row.Location

    if ((row.Latitude >= emea_lat_min) and
        (row.Latitude <= emea_lat_max) and
        (row.Longitude >= emea_long_min) and
        (row.Longitude <= emea_long_max)):
        continent = 'emea'
    # needed for Hawaii which is in Pacific
    elif row.Country == 'USA':
        continent = 'americas'
    else:
        continent = region_by_continent.get(row.Continent, 'N/A')

    # 70.3 races
    if row.Format == 'half':
        middle = '/ironman-70.3/'
        # Both world-championship entries point to the same page, filed under 'americas'.
        if location in ('70.3-world-championship-mens-race',
                        '70.3-world-championship-womens-race'):
            location = '70.3-world-championship'
            continent = 'americas'
    # 140.6 races
    elif row.Format == 'iron':
        middle = '/ironman/'
    else:
        # No URL can be built without a format: keep the 'N/A' placeholder
        # rather than reusing the value left over from the previous row.
        print('!!! unknown format')
        continue

    # Named `link` (not `url`) so that the notebook-level `url` holding the
    # data-source address is not overwritten by this loop.
    link = prefix + continent + middle + location + suffix

    # Show the links whose region could not be resolved.
    if continent == 'N/A':
        print(link)

    if do_we_want_to_ping_url:
        if ping(link):
            print("%d ok for %s (%s)" % (index, row.Name, row.Date))
        else:
            print("%d KO for %s !!" % (index, link))

    # .at replaces DataFrame.set_value (deprecated, removed in pandas 1.0).
    df.at[index, 'Link'] = link


/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:83: FutureWarning: set_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead

In [12]:
# Preview the first rows, now including the Link column.
df.head()


Out[12]:
Date Format Name Location Country Start-Time Continent City-Time Latitude Longitude Hashtag Link
752 2019-01-13 half 70.3 Pucon pucon CHL 08:00:00 America Santiago -39.270416 -71.978283 #pucon703 http://www.ironman.com/triathlon/events/americ...
753 2019-01-27 half 70.3 South Africa south-africa RSA 07:00:00 Africa Johannesburg -33.029158 27.854587 #IM703SA http://www.ironman.com/triathlon/events/emea/i...
754 2019-02-01 half 70.3 Dubai dubai ARE 07:00:00 Asia Dubai 25.204849 55.270783 #IM703Dubai http://www.ironman.com/triathlon/events/emea/i...
755 2019-02-17 half 70.3 Geelong geelong AUS 07:00:00 Australia Hobart -38.146312 144.369272 #IM703Geelong http://www.ironman.com/triathlon/events/asiapa...
759 2019-02-24 half 70.3 Colombo colombo LKA 06:15:00 Asia Colombo 6.929042 79.842438 NaN http://www.ironman.com/triathlon/events/asiapa...

In [13]:
# Write the calendar of the selected year to races-<year>.csv, without the index column.
df.to_csv('races-{}.csv'.format(year), index=False, encoding='utf8')

In [ ]:


In [14]:
#def_add_link
prefix = 'http://www.ironman.com/triathlon/events/'
suffix = '.aspx'


def add_link(frame, ping_urls=False):
    """Fill frame['Link'] with the official event-page URL of every race.

    frame     -- DataFrame with Format, Location, Country, Continent,
                 Latitude and Longitude columns (plus Name and Date when
                 ping_urls is True). It is modified in place.
    ping_urls -- when True every URL is requested through ping(); keep it
                 False as often as possible in order not to burden the
                 official web-site.

    Rows whose Format is neither 'half' nor 'iron' keep the 'N/A' placeholder.
    """
    # Bounding box (degrees) whose races are filed under 'emea' whatever their
    # timezone continent is; needed for places such as Dubai and Kazakhstan.
    emea_lat_min, emea_lat_max = 25.0, 55.0
    emea_long_min, emea_long_max = 12.0, 75.0

    # Site region used for each timezone continent.
    region_by_continent = {
        'Africa': 'emea',
        'America': 'americas',
        'Atlantic': 'emea',
        'Asia': 'asiapac',
        'Australia': 'asiapac',
        'Europe': 'emea',
        'Pacific': 'asiapac',
    }
    # URL path segment of each race format (70.3 and 140.6 races).
    middle_by_format = {'half': '/ironman-70.3/', 'iron': '/ironman/'}
    # Both world-championship entries point to the same page.
    world_championship = ('70.3-world-championship-mens-race',
                          '70.3-world-championship-womens-race')

    frame['Link'] = 'N/A'

    for index, row in frame.iterrows():
        if row.Format not in middle_by_format:
            # No URL can be built without a format: keep the placeholder
            # rather than reusing the value left over from the previous row.
            print('!!! unknown format')
            continue
        middle = middle_by_format[row.Format]
        location = row.Location

        if (emea_lat_min <= row.Latitude <= emea_lat_max and
                emea_long_min <= row.Longitude <= emea_long_max):
            continent = 'emea'
        # needed for Hawaii which is in Pacific
        elif row.Country == 'USA':
            continent = 'americas'
        else:
            continent = region_by_continent.get(row.Continent, 'N/A')

        if row.Format == 'half' and location in world_championship:
            location = '70.3-world-championship'
            continent = 'americas'

        # Named `link` (not `url`) so that the notebook-level `url` holding
        # the data-source address is not overwritten.
        link = prefix + continent + middle + location + suffix

        # Show the links whose region could not be resolved.
        if continent == 'N/A':
            print(link)

        if ping_urls:
            if ping(link):
                print("%d ok for %s (%s)" % (index, row.Name, row.Date))
            else:
                print("%d KO for %s !!" % (index, link))

        # .at replaces DataFrame.set_value (deprecated, removed in pandas 1.0).
        frame.at[index, 'Link'] = link


add_link(df)


/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:80: FutureWarning: set_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead

In [ ]: