In [3]:
import pandas
import arrow
from time import strptime
import datetime

In [7]:
def tendata(dirc, filename):
    matdat = pandas.read_csv(dirc + filename)
    matdat.drop(matdat.tail(2).index,inplace=True) # drop last n rows
    timelist = list()

    for wrtime in matdat.Tournament_Date:
        wrspi = (wrtime.split('-'))
        #print('20' +str(wrspi[2]) + '-' + str(strptime(wrspi[1],'%b').tm_mon) + '-' + str(wrspi[0]))
        yearnow = int('20' +str(wrspi[2]))
        #print(yearnow)
        mthnow = int(strptime(wrspi[1],'%b').tm_mon)
        #print(mthnow)
        daynow = int(wrspi[0])
        #print(daynow)

        #arnow = arrow.get(str(yearnow) + '-' + str(mthnow) + '-' + str(daynow), 'YYYY M DD')
        arfin = arrow.get(yearnow, mthnow, daynow)
        #arnows = arrow.get(wrtime, 'YYYY MM DD')
        #print(arnows)
        #print(arfin.date)
        timelist.append(arfin.datetime)
        #print(strptime(wrspi[1],'%b').tm_mon)
        #print('20' +str(wrspi[2]))
    n = matdat.columns[3]

    matdat.drop(n, axis = 1, inplace = True)
    
    df1 = matdat.assign(date = timelist)
    df1.to_csv('{}'.format(filename), index_label=False)

In [8]:
tendata('/mnt/c/Users/luke/Downloads/datathon/Datathon/', 'ATP_matches.csv')


/mnt/c/Users/luke/Documents/pycon3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:3189: DtypeWarning: Columns (11,12,13,14,17,18,24,25,26,27,30,31) have mixed types. Specify dtype option on import or set low_memory=False.
  if (yield from self.run_code(code, result)):

In [ ]:


In [9]:
tendata('/mnt/c/Users/luke/Downloads/datathon/Datathon/', 'WTA_matches.csv')

In [10]:
dateten = pandas.read_csv('/mnt/c/Users/luke/Documents/test123.csv', index_col='date')

In [11]:
surfacechoice = set(dateten['Court_Surface'])

In [12]:
surfacechoice


Out[12]:
{'Carpet', 'Clay', 'Grass', 'Hard', 'Indoor Hard'}

In [14]:
sampfil = pandas.read_csv('/mnt/c/Users/luke/Downloads/datathon/Datathon/women_dummy_submission_file.csv', encoding='latin-1')

In [60]:
windict = dict()

In [ ]:


In [62]:
for sampf in sampfil.values:
    #print(sampf[0])
    #print(sampf[1])
    playerstat = winlosecheck(sampf[0], sampf[1])
    
    totgame = playerstat['playerwins'] + playerstat['playerlosers']
    try:
        windict.update({'player_1_win_probability' : 100 * float(playerstat['playerwins'])//float(totgame), 'player1' : sampf[0],
            'player2' : sampf[1]})
    except ZeroDivisionError:
        windict.update({'player_1_win_probability' : 'nan', 'player1' : sampf[0],
            'player2' : sampf[1]})


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-62-9aa770784f0c> in <module>
      2     #print(sampf[0])
      3     #print(sampf[1])
----> 4     playerstat = winlosecheck(sampf[0], sampf[1])
      5 
      6     totgame = playerstat['playerwins'] + playerstat['playerlosers']

<ipython-input-23-f1e3a122d1b1> in winlosecheck(player, anonplayer)
      1 def winlosecheck(player, anonplayer):
      2     winch = playersearch(player, anonplayer).shape[0]
----> 3     losch = playersearch(anonplayer, player).shape[0]
      4     return({'playerwins' : winch, 'playerlosers' : losch})
      5 

<ipython-input-22-ba811b2f46a2> in playersearch(player, anonplayer)
      1 def playersearch(player, anonplayer):
----> 2     winfrm = winnersearch(player)
      3     return(winfrm[winfrm['Loser'].str.match(anonplayer)])

<ipython-input-20-0a790d108ca2> in winnersearch(nameofwinner)
      1 def winnersearch(nameofwinner):
----> 2     windf = (dateten[dateten['Winner'].str.match(nameofwinner)])
      3     return(windf)
      4 
      5 def losersearch(nameofloser):

/mnt/c/Users/luke/Documents/pycon3/lib/python3.5/site-packages/pandas/core/strings.py in match(self, pat, case, flags, na, as_indexer)
   2421     def match(self, pat, case=True, flags=0, na=np.nan, as_indexer=None):
   2422         result = str_match(self._data, pat, case=case, flags=flags, na=na,
-> 2423                            as_indexer=as_indexer)
   2424         return self._wrap_result(result)
   2425 

/mnt/c/Users/luke/Documents/pycon3/lib/python3.5/site-packages/pandas/core/strings.py in str_match(arr, pat, case, flags, na, as_indexer)
    752     f = lambda x: bool(regex.match(x))
    753 
--> 754     return _na_map(f, arr, na, dtype=dtype)
    755 
    756 

/mnt/c/Users/luke/Documents/pycon3/lib/python3.5/site-packages/pandas/core/strings.py in _na_map(f, arr, na_result, dtype)
    148 def _na_map(f, arr, na_result=np.nan, dtype=object):
    149     # should really _check_ for NA
--> 150     return _map(f, arr, na_mask=True, na_value=na_result, dtype=dtype)
    151 
    152 

/mnt/c/Users/luke/Documents/pycon3/lib/python3.5/site-packages/pandas/core/strings.py in _map(f, arr, na_mask, na_value, dtype)
    163         try:
    164             convert = not all(mask)
--> 165             result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert)
    166         except (TypeError, AttributeError) as e:
    167             # Reraise the exception if callable `f` got wrong number of args.

pandas/_libs/src/inference.pyx in pandas._libs.lib.map_infer_mask()

/mnt/c/Users/luke/Documents/pycon3/lib/python3.5/site-packages/pandas/core/strings.py in <lambda>(x)
    750 
    751     dtype = bool
--> 752     f = lambda x: bool(regex.match(x))
    753 
    754     return _na_map(f, arr, na, dtype=dtype)

KeyboardInterrupt: 

In [ ]:
with open('winprob.json', 'w') as winwr:
    winwr.write(json.dumps(windict))

In [ ]:


In [19]:
yrwins = dict()

In [20]:
def winnersearch(nameofwinner):
    windf = (dateten[dateten['Winner'].str.match(nameofwinner)])
    return(windf)

def losersearch(nameofloser):
    return(dateten[dateten['Loser'].str.match(nameofloser)])

def filtersurface(nameofsurface):
    return(dateten[dateten['Court_Surface'].str.match(nameofsurface)])

def playersearch(player, anonplayer):
    winfrm = winnersearch(player)
    return(winfrm[winfrm['Loser'].str.match(anonplayer)])

In [21]:
#function that takes 2 players and type of surface and returns the win chance.

In [22]:
def playersearch(player, anonplayer):
    winfrm = winnersearch(player)
    return(winfrm[winfrm['Loser'].str.match(anonplayer)])

In [23]:
def winlosecheck(player, anonplayer):
    winch = playersearch(player, anonplayer).shape[0]
    losch = playersearch(anonplayer, player).shape[0]
    return({'playerwins' : winch, 'playerlosers' : losch})

In [24]:
playerstat = winlosecheck('Ana Ivanovic', 'Serena Williams')

In [25]:
playerstat['playerwins']


Out[25]:
1

In [27]:
playerstat['playerlosers']


Out[27]:
5

In [28]:
totgame = playerstat['playerwins'] + playerstat['playerlosers']

In [29]:
totgame


Out[29]:
6

In [39]:
print(100 * float(playerstat['playerwins'])/float(totgame))


16.666666666666668

In [ ]:


In [32]:
float(totgame / playerstat['playerwins'])


Out[32]:
6.0

In [ ]:


In [ ]:


In [80]:
playersearch('Serena Williams', 'Andrea Petkovic').shape[0]


Out[80]:
4

In [78]:
playersearch('Andrea Petkovic', 'Serena Williams')


Out[78]:
Winner Loser Tournament Court_Surface Round_Description Winner_Rank Loser_Rank Retirement_Ind Winner_Sets_Won Winner_Games_Won ... Loser_DoubleFaults Loser_FirstServes_Won Loser_FirstServes_In Loser_SecondServes_Won Loser_SecondServes_In Loser_BreakPoints_Won Loser_BreakPoints Loser_ReturnPoints_Won Loser_ReturnPoints_Faced Loser_TotalPoints_Won
date

0 rows × 34 columns


In [ ]:


In [48]:
winnersearch('Serena Williams').head()


Out[48]:
Winner Loser Tournament Court_Surface Round_Description Winner_Rank Loser_Rank Retirement_Ind Winner_Sets_Won Winner_Games_Won ... Loser_DoubleFaults Loser_FirstServes_Won Loser_FirstServes_In Loser_SecondServes_Won Loser_SecondServes_In Loser_BreakPoints_Won Loser_BreakPoints Loser_ReturnPoints_Won Loser_ReturnPoints_Faced Loser_TotalPoints_Won
date
2014-01-01 00:00:00+00:00 Serena Williams Andrea Petkovic Brisbane Hard Second Round 1 43 0.0 2 12.0 ... 4 33 49 10 26 1 6 23 69 66
2014-01-01 00:00:00+00:00 Serena Williams Dominika Cibulkova Brisbane Hard Quarter-finals 1 23 0.0 2 12.0 ... 1 20 26 7 17 0 1 6 43 33
2014-01-01 00:00:00+00:00 Serena Williams Maria Sharapova Brisbane Hard Semi-finals 1 4 0.0 2 13.0 ... 8 16 35 14 29 4 5 31 64 61
2014-01-01 00:00:00+00:00 Serena Williams Victoria Azarenka Brisbane Hard Finals 1 2 0.0 2 13.0 ... 6 35 55 8 21 2 3 19 59 62
2014-01-13 00:00:00+00:00 Serena Williams Ashleigh Barty Australian Open, Melbourne Hard First Round 1 155 0.0 2 12.0 ... 4 14 28 11 27 0 0 6 34 31

5 rows × 34 columns


In [ ]:


In [49]:
filtersurface('Grass').head()


Out[49]:
Winner Loser Tournament Court_Surface Round_Description Winner_Rank Loser_Rank Retirement_Ind Winner_Sets_Won Winner_Games_Won ... Loser_DoubleFaults Loser_FirstServes_Won Loser_FirstServes_In Loser_SecondServes_Won Loser_SecondServes_In Loser_BreakPoints_Won Loser_BreakPoints Loser_ReturnPoints_Won Loser_ReturnPoints_Faced Loser_TotalPoints_Won
date
2014-06-09 00:00:00+00:00 Shuai Zhang Eleni Daniilidou Birmingham Grass First Round 36 345 0.0 2 13.0 ... 6 33 54 4 15 3 6 30 72 67
2014-06-09 00:00:00+00:00 Francesca Schiavone Jie Zheng Birmingham Grass First Round 71 90 0.0 2 12.0 ... 0 18 37 11 24 3 7 28 69 57
2014-06-09 00:00:00+00:00 Madison Keys Teliana Pereira Birmingham Grass First Round 42 91 0.0 2 12.0 ... 2 22 35 7 15 0 1 15 55 44
2014-06-09 00:00:00+00:00 Monica Puig Stefanie Voegele Birmingham Grass First Round 50 76 0.0 2 18.0 ... 0 33 46 16 34 2 4 33 90 82
2014-06-09 00:00:00+00:00 Nadiya Kichenok Sharon Fichman Birmingham Grass First Round 117 84 0.0 2 19.0 ... 7 48 70 15 40 4 8 44 111 107

5 rows × 34 columns


In [50]:
winnersearch('Qiang Wang').head()


Out[50]:
Winner Loser Tournament Court_Surface Round_Description Winner_Rank Loser_Rank Retirement_Ind Winner_Sets_Won Winner_Games_Won ... Loser_DoubleFaults Loser_FirstServes_Won Loser_FirstServes_In Loser_SecondServes_Won Loser_SecondServes_In Loser_BreakPoints_Won Loser_BreakPoints Loser_ReturnPoints_Won Loser_ReturnPoints_Faced Loser_TotalPoints_Won
date
2014-08-25 00:00:00+00:00 Qiang Wang Evgeniya Rodina U.S. Open, New York Hard Qualifying 139 217 0.0 2 16.0 ... . . . . . . . . . .
2014-08-25 00:00:00+00:00 Qiang Wang Katerina Vankova U.S. Open, New York Hard Qualifying 139 258 0.0 2 13.0 ... . . . . . . . . . .
2014-08-25 00:00:00+00:00 Qiang Wang Lara Arruabarrena U.S. Open, New York Hard Qualifying 139 75 0.0 2 13.0 ... . . . . . . . . . .
2014-08-25 00:00:00+00:00 Qiang Wang Paula Kania U.S. Open, New York Hard First Round 139 159 0.0 2 12.0 ... 7 12 24 3 17 0 6 . . 31
2015-02-16 00:00:00+00:00 Qiang Wang Kurumi Nara Dubai Hard First Round 98 42 0.0 2 18.0 ... 5 46 83 7 23 4 8 38 95 91

5 rows × 34 columns


In [51]:
losersearch('Qiang Wang').head()


Out[51]:
Winner Loser Tournament Court_Surface Round_Description Winner_Rank Loser_Rank Retirement_Ind Winner_Sets_Won Winner_Games_Won ... Loser_DoubleFaults Loser_FirstServes_Won Loser_FirstServes_In Loser_SecondServes_Won Loser_SecondServes_In Loser_BreakPoints_Won Loser_BreakPoints Loser_ReturnPoints_Won Loser_ReturnPoints_Faced Loser_TotalPoints_Won
date
2014-08-25 00:00:00+00:00 Casey Dellacqua Qiang Wang U.S. Open, New York Hard Second Round 32 139 0.0 2 16.0 ... 1 34 60 21 37 2 9 32 92 87
2014-10-06 00:00:00+00:00 Fangzhou Liu Qiang Wang Tianjin Hard First Round 273 107 0.0 2 12.0 ... 2 13 24 6 18 0 2 15 50 34
2015-01-05 00:00:00+00:00 Timea Babos Qiang Wang Shenzhen Hard First Round 96 101 0.0 2 18.0 ... 9 35 50 21 43 0 6 34 102 90
2015-01-19 00:00:00+00:00 Polona Hercog Qiang Wang Australian Open, Melbourne Hard First Round 88 103 0.0 2 15.0 ... 1 29 48 13 26 3 14 36 92 78
2015-02-09 00:00:00+00:00 Vera Zvonareva Qiang Wang Pattaya City Hard First Round 189 98 0.0 2 12.0 ... 7 12 26 2 19 3 6 24 50 38

5 rows × 34 columns


In [ ]:


In [52]:
dateten[dateten['Winner'].str.match('Timea Babos')].head()


Out[52]:
Winner Loser Tournament Court_Surface Round_Description Winner_Rank Loser_Rank Retirement_Ind Winner_Sets_Won Winner_Games_Won ... Loser_DoubleFaults Loser_FirstServes_Won Loser_FirstServes_In Loser_SecondServes_Won Loser_SecondServes_In Loser_BreakPoints_Won Loser_BreakPoints Loser_ReturnPoints_Won Loser_ReturnPoints_Faced Loser_TotalPoints_Won
date
2014-03-31 00:00:00+00:00 Timea Babos Ximena Hermoso Monterrey Hard First Round 108 454 0.0 2 12.0 ... 2 16 30 8 20 0 0 8 40 32
2014-05-26 00:00:00+00:00 Timea Babos Maria Irigoyen French Open, Paris Clay Qualifying 101 160 0.0 2 13.0 ... . . . . . . . . . .
2014-05-26 00:00:00+00:00 Timea Babos Indy De Vroome French Open, Paris Clay Qualifying 101 203 0.0 2 13.0 ... . . . . . . . . . .
2014-06-09 00:00:00+00:00 Timea Babos Madison Keys Birmingham Grass Second Round 108 42 0.0 2 12.0 ... 2 21 39 11 27 0 4 13 43 45
2014-09-08 00:00:00+00:00 Timea Babos Romina Oprandi Quebec City Indoor Hard First Round 109 149 0.0 2 15.0 ... 1 32 56 14 28 3 5 27 77 73

5 rows × 34 columns


In [53]:
for yearsw in range(2014, 2019):
    print(yearsw)
    curyear = (dateten.loc['{}-1-1 01:00:00'.format(yearsw) : '{}-12-31 23:00:00'.format(yearsw)])
    #print(curyear)
    yrwins.update({yearsw : list(curyear.Winner)})


2014
2015
2016
2017
2018

In [54]:
from time import strptime

strptime('Feb','%b').tm_mon


Out[54]:
2

In [55]:
lislose =list(dateten.Loser)

In [56]:
liswin = list(dateten.Winner)

In [57]:
liswin = list(allwiner)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-57-14c797776d66> in <module>
----> 1 liswin = list(allwiner)

NameError: name 'allwiner' is not defined

In [58]:
winset = set(liswin)

In [ ]:
winhisd = dict()

In [ ]:


In [ ]:
for wins in winset:
    #print(wins)
    totgame = liswin.count(wins) + lislose.count(wins)
    winhisd.update({'name' : wins, 'winhist' : liswin.count(wins), 'losehist' : lislose.count(wins),
          'totalgame' : totgame, 'winpercent' : (liswin.count(wins)/totgame) *100})
    #print(lislose.count(wins))

In [ ]:
with open('winhistory.json', 'w') as winh:
    winh.wrie(json.dumps(winhisd))