notebook.community

Edit and run



In [1]:

    
import pandas as pd
import numpy as np
import io
import requests



In [2]:

    
year = 2019



In [15]:

    
url = 'https://r:o@url_kuzhet'



In [4]:

    
s= requests.get(url).content
u = io.StringIO(s.decode('utf-8'))

dfo = pd.read_csv(u, 
                  index_col=False,
                  names = ["Date", "Format", "Name", "Location", "Place", "Kpr","Prize","Country","Start-Time","Continent","City-Time","Latitude","Longitude","Address1","Hashtag","Adress2","Slots","Pro","C","D","URL"],
                  dtype = {'Latitude': np.float64, 'Longitude': np.float64}, 
                  encoding = 'utf8')
dfo.head()
df = dfo[['Date', 'Format', 'Name', 'Location', 'Country', 'Start-Time','Continent', 'City-Time', 'Latitude','Longitude','Hashtag']]
df = df[df['Date'].str.contains(str(year))]



In [5]:

    
df.head(3)









    Out[5]:







  
    
      
      Date
      Format
      Name
      Location
      Country
      Start-Time
      Continent
      City-Time
      Latitude
      Longitude
      Hashtag
    
  
  
    
      752
      2019-01-13
      half
      70.3 Pucon
      pucon
      CHL
      08:00:00
      America
      Santiago
      -39.270416
      -71.978283
      #pucon703
    
    
      753
      2019-01-27
      half
      70.3 South Africa
      south-africa
      RSA
      07:00:00
      Africa
      Johannesburg
      -33.029158
      27.854587
      #IM703SA
    
    
      754
      2019-02-01
      half
      70.3 Dubai
      dubai
      ARE
      07:00:00
      Asia
      Dubai
      25.204849
      55.270783
      #IM703Dubai



In [6]:

    
import urllib.request
import time

def ping(url):
    time.sleep(5)
    try:
        contents = urllib.request.urlopen(url).read()
        return True
    except Exception:
        print("not fecthed")
        return False



In [7]:

    
# bug if official web site
#let's remove it
df = df[df.Name != "IRONMAN 70.3 World Championship - Women's Results"]



In [8]:

    
from pytz import timezone
from datetime import datetime, timedelta

#from pytz import timezone
ContinentCityZone = df.Continent+'/'+df['City-Time']
df['TzInfo'] = [timezone(x) for x in ContinentCityZone]
df['DateStartTime'] = df.Date +' '+df['Start-Time']


df['Timestamp'] = [pd.Timestamp(x) for x in df.DateStartTime]

for index, row in df.iterrows():
    df.set_value(index, 'GMT', row.Timestamp.tz_localize(row.TzInfo))
    
df = df.drop(['TzInfo', 'DateStartTime', 'Timestamp'], axis=1)









    



/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:13: FutureWarning: set_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead
  del sys.path[0]



In [9]:

    
df[df.Continent== 'Europe'].sort_values(by="GMT").tail(3)









    Out[9]:







  
    
      
      Date
      Format
      Name
      Location
      Country
      Start-Time
      Continent
      City-Time
      Latitude
      Longitude
      Hashtag
      GMT
    
  
  
    
      883
      2019-09-29
      half
      70.3 Portugal
      cascais
      PRT
      07:30:00
      Europe
      Lisbon
      38.696661
      -9.419599
      #IMI703Portugal
      2019-09-29 07:30:00+01:00
    
    
      885
      2019-10-06
      iron
      Barcelona
      barcelona
      ESP
      08:08:00
      Europe
      Madrid
      41.610325
      2.646578
      #IMBarcelona
      2019-10-06 08:08:00+02:00
    
    
      898
      2019-10-28
      half
      70.3 Turkey
      turkey
      TUR
      07:25:00
      Europe
      Istanbul
      36.836766
      31.108628
      #IM703Turkey
      2019-10-28 07:25:00+03:00



In [10]:

    
df = df.drop(['GMT'], axis=1)



In [ ]:



In [11]:

    
#"http://www.ironman.com/triathlon/events/americas/ironman-70.3/pucon.aspx"
#"http://eu.ironman.com/triathlon/events/emea/ironman/south-africa.aspx"


df['Link'] = 'N/A'


prefix = 'http://www.ironman.com/triathlon/events/'
suffix = '.aspx'

for index, row in df.iterrows():
    url = 'N/A'
    continent = 'N/A'
    
    #print(row.Name)
    
    emea_lat_min = 25.0
    emea_lat_max = 55.0
    emea_long_min = 12.0
    emea_long_max = 75.0
    
    ## Needed for Arabic countries like Dubaï and Kasaksthan
    if ((row.Latitude >= emea_lat_min) and 
        (row.Latitude <= emea_lat_max) and 
        (row.Longitude >= emea_long_min) and 
        (row.Longitude <= emea_long_max)):
        continent = 'emea'
    ## needed for Hawaii which is in Pacific
    elif row.Country == 'USA':
        continent = 'americas'    
    elif row.Continent == 'Africa':
        continent = 'emea'
    elif row.Continent == 'America':
        continent = 'americas'
    elif row.Continent == 'Atlantic':
        continent = 'emea'
    elif row.Continent == 'Asia':
        continent = 'asiapac'
    elif row.Continent == 'Australia':
        continent = 'asiapac'
    elif row.Continent == 'Europe':
        continent = 'emea'
    elif row.Continent == 'Pacific':
        continent = 'asiapac'  
    
    # 70.3 races
    if row.Format == 'half':
        middle = '/ironman-70.3/'
        if row.Location == '70.3-world-championship-mens-race':
            row.Location = '70.3-world-championship'
            continent = 'americas'
        elif row.Location == '70.3-world-championship-womens-race':
            row.Location = '70.3-world-championship'
            continent = 'americas'
        
    # 140.6 races
    elif row.Format == 'iron':
        middle = '/ironman/'
        
    else :
        print('!!! unknown format')
        
    url = prefix + continent + middle + row.Location + suffix
    
    if continent == '/NA!!!/':
        print(url)
    
    # put next flag to False as often as possible in order not to burden official web-site
    do_we_want_to_ping_url = False
    if do_we_want_to_ping_url:
        if ping(url) == True:
            print("%d ok for %s (%s)" % (index, row.Name, row.Date))
        else:
            print("%d KO for %s !!" % (index, url))
    #else:
        #print("%d ok for %s (%s) url=%s" % (index, row.Name, row.Date, url))
    
    # debug to check if some 70.3 event have a name that does not start with 70.3
    #if row.Format == 'half':
    #    if row.Name.startswith('70') == False:
    #        print("%d ok for %s (%s) url=%s" % (index, row.Name, row.Date, url))
    
    df.set_value(index, 'Link', url)
    
    #if index >= 1200:
    #    break
    #"http://www.ironman.com/triathlon/events/americas/ironman-70.3/pucon.aspx"









    



/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:83: FutureWarning: set_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead



In [12]:

    
df.head()









    Out[12]:







  
    
      
      Date
      Format
      Name
      Location
      Country
      Start-Time
      Continent
      City-Time
      Latitude
      Longitude
      Hashtag
      Link
    
  
  
    
      752
      2019-01-13
      half
      70.3 Pucon
      pucon
      CHL
      08:00:00
      America
      Santiago
      -39.270416
      -71.978283
      #pucon703
      http://www.ironman.com/triathlon/events/americ...
    
    
      753
      2019-01-27
      half
      70.3 South Africa
      south-africa
      RSA
      07:00:00
      Africa
      Johannesburg
      -33.029158
      27.854587
      #IM703SA
      http://www.ironman.com/triathlon/events/emea/i...
    
    
      754
      2019-02-01
      half
      70.3 Dubai
      dubai
      ARE
      07:00:00
      Asia
      Dubai
      25.204849
      55.270783
      #IM703Dubai
      http://www.ironman.com/triathlon/events/emea/i...
    
    
      755
      2019-02-17
      half
      70.3 Geelong
      geelong
      AUS
      07:00:00
      Australia
      Hobart
      -38.146312
      144.369272
      #IM703Geelong
      http://www.ironman.com/triathlon/events/asiapa...
    
    
      759
      2019-02-24
      half
      70.3 Colombo
      colombo
      LKA
      06:15:00
      Asia
      Colombo
      6.929042
      79.842438
      NaN
      http://www.ironman.com/triathlon/events/asiapa...



In [13]:

    
df.to_csv(path_or_buf='races-'+str(year)+'.csv', encoding = 'utf8', index=False)



In [ ]:



In [14]:

    
#def_add_link
df['Link'] = 'N/A'


prefix = 'http://www.ironman.com/triathlon/events/'
suffix = '.aspx'

for index, row in df.iterrows():
    url = 'N/A'
    continent = 'N/A'
    
    #print(row.Name)
    
    emea_lat_min = 25.0
    emea_lat_max = 55.0
    emea_long_min = 12.0
    emea_long_max = 75.0
    
    ## Needed for Arabic countries like Dubaï and Kasaksthan
    if ((row.Latitude >= emea_lat_min) and 
        (row.Latitude <= emea_lat_max) and 
        (row.Longitude >= emea_long_min) and 
        (row.Longitude <= emea_long_max)):
        continent = 'emea'
    ## needed for Hawaii which is in Pacific
    elif row.Country == 'USA':
        continent = 'americas'    
    elif row.Continent == 'Africa':
        continent = 'emea'
    elif row.Continent == 'America':
        continent = 'americas'
    elif row.Continent == 'Atlantic':
        continent = 'emea'
    elif row.Continent == 'Asia':
        continent = 'asiapac'
    elif row.Continent == 'Australia':
        continent = 'asiapac'
    elif row.Continent == 'Europe':
        continent = 'emea'
    elif row.Continent == 'Pacific':
        continent = 'asiapac'  
    
    # 70.3 races
    if row.Format == 'half':
        middle = '/ironman-70.3/'
        if row.Location == '70.3-world-championship-mens-race':
            row.Location = '70.3-world-championship'
            continent = 'americas'
        elif row.Location == '70.3-world-championship-womens-race':
            row.Location = '70.3-world-championship'
            continent = 'americas'
        
    # 140.6 races
    elif row.Format == 'iron':
        middle = '/ironman/'
        
    else :
        print('!!! unknown format')
        
    url = prefix + continent + middle + row.Location + suffix
    
    if continent == '/NA!!!/':
        print(url)
    
    # put next flag to False as often as possible in order not to burden official web-site
    do_we_want_to_ping_url = False
    if do_we_want_to_ping_url:
        if ping(url) == True:
            print("%d ok for %s (%s)" % (index, row.Name, row.Date))
        else:
            print("%d KO for %s !!" % (index, url))
    #else:
        #print("%d ok for %s (%s) url=%s" % (index, row.Name, row.Date, url))
    
    # debug to check if some 70.3 event have a name that does not start with 70.3
    #if row.Format == 'half':
    #    if row.Name.startswith('70') == False:
    #        print("%d ok for %s (%s) url=%s" % (index, row.Name, row.Date, url))
    
    df.set_value(index, 'Link', url)









    



/usr/local/lib/python3.5/dist-packages/ipykernel_launcher.py:80: FutureWarning: set_value is deprecated and will be removed in a future release. Please use .at[] or .iat[] accessors instead



In [ ]:

	Date	Format	Name	Location	Country	Start-Time	Continent	City-Time	Latitude	Longitude	Hashtag
752	2019-01-13	half	70.3 Pucon	pucon	CHL	08:00:00	America	Santiago	-39.270416	-71.978283	#pucon703
753	2019-01-27	half	70.3 South Africa	south-africa	RSA	07:00:00	Africa	Johannesburg	-33.029158	27.854587	#IM703SA
754	2019-02-01	half	70.3 Dubai	dubai	ARE	07:00:00	Asia	Dubai	25.204849	55.270783	#IM703Dubai

	Date	Format	Name	Location	Country	Start-Time	Continent	City-Time	Latitude	Longitude	Hashtag	GMT
883	2019-09-29	half	70.3 Portugal	cascais	PRT	07:30:00	Europe	Lisbon	38.696661	-9.419599	#IMI703Portugal	2019-09-29 07:30:00+01:00
885	2019-10-06	iron	Barcelona	barcelona	ESP	08:08:00	Europe	Madrid	41.610325	2.646578	#IMBarcelona	2019-10-06 08:08:00+02:00
898	2019-10-28	half	70.3 Turkey	turkey	TUR	07:25:00	Europe	Istanbul	36.836766	31.108628	#IM703Turkey	2019-10-28 07:25:00+03:00