In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

# List the files available in the Kaggle input directory.
# os.listdir is portable (no shelling out to `ls`, which fails on
# non-Unix hosts); sort for a stable, ls-like alphabetical ordering.
import os
print('\n'.join(sorted(os.listdir('../input'))))

# Any results you write to the current directory are saved as output.


WBracket_SampleTourney2018.png
WCities.csv
WCities_PrelimData2018.csv
WGameCities.csv
WGameCities_PrelimData2018.csv
WNCAATourneyCompactResults.csv
WNCAATourneyCompactResults_PrelimData2018.csv
WNCAATourneyDetailedResults.csv
WNCAATourneyDetailedResults_PrelimData2018.csv
WNCAATourneySeeds.csv
WNCAATourneySeeds_SampleTourney2018.csv
WNCAATourneySlots.csv
WRegularSeasonCompactResults.csv
WRegularSeasonCompactResults_PrelimData2018.csv
WRegularSeasonDetailedResults.csv
WRegularSeasonDetailedResults_PrelimData2018.csv
WSampleSubmissionStage1.csv
WSampleSubmissionStage2.csv
WSampleSubmissionStage2_SampleTourney2018.csv
WSeasons.csv
WSeasons_SampleTourney2018.csv
WTeamSpellings.csv
WTeams.csv

First we load the datasets of interest.


In [2]:
# The tournament seed assignments (sample 2018 tourney file).
df_seeds = pd.read_csv("../input/WNCAATourneySeeds_SampleTourney2018.csv")

# Regular-season compact results (preliminary 2018 data).
df_tour = pd.read_csv("../input/WRegularSeasonCompactResults_PrelimData2018.csv")

Now we separate the winners from the losers and organize our dataset


In [3]:
# Parse the numeric seed out of strings like 'W01' (characters 1-2).
# The vectorized .str accessor replaces the original row-wise apply.
df_seeds['seed_int'] = df_seeds['Seed'].str[1:3].astype(int)

# Build winner- and loser-labelled copies of the seed table so every game
# row can pick up both teams' seeds by merging twice.
df_winseeds = df_seeds.loc[:, ['TeamID', 'Season', 'seed_int']].rename(columns={'TeamID': 'WTeamID', 'seed_int': 'WSeed'})
df_lossseeds = df_seeds.loc[:, ['TeamID', 'Season', 'seed_int']].rename(columns={'TeamID': 'LTeamID', 'seed_int': 'LSeed'})

# Attach the winner's seed (left join keeps unseeded games), then the
# loser's seed (inner join, as in the original).
df_dummy = pd.merge(left=df_tour, right=df_winseeds, how='left', on=['Season', 'WTeamID'])
df_concat = pd.merge(left=df_dummy, right=df_lossseeds, on=['Season', 'LTeamID'])

Now we add a flag indicating whether the two teams' seeds differ in the merged dataset above.


In [4]:
# 1 if the two teams carry different seeds, else 0. A vectorized
# comparison replaces the original row-wise apply and preserves the
# integer 0/1 coding.
df_concat['DiffSeed'] = (df_concat['WSeed'] != df_concat['LSeed']).astype(int)

Here we get our submission info


In [5]:
# Load the Stage 2 sample submission. Its ID column encodes
# Season_TeamID1_TeamID2 for every matchup we must predict
# (parsed apart in the next cell).
df_sample_sub = pd.read_csv('../input/WSampleSubmissionStage2.csv')

In [6]:
# Parse the submission ID ('Season_TeamID1_TeamID2') once with a single
# vectorized split instead of three separate row-wise applies.
id_parts = df_sample_sub['ID'].str.split('_', expand=True).astype(int)
df_sample_sub['Season'] = id_parts[0]
df_sample_sub['TeamID1'] = id_parts[1]
df_sample_sub['TeamID2'] = id_parts[2]

Training Data Creation


In [7]:
def _relabel(games, team1_won):
    """Relabel the W*/L* game columns as Team1/Team2.

    Team1 is the winner when team1_won is True, the loser otherwise;
    the Result column records 1.0 / 0.0 accordingly. Seed and location
    columns are dropped, as in the original hand-written versions.
    """
    t1, t2 = ('W', 'L') if team1_won else ('L', 'W')
    out = games.rename(columns={t1 + 'TeamID': 'TeamID1',
                                t2 + 'TeamID': 'TeamID2',
                                t1 + 'Score': 'Team1_Score',
                                t2 + 'Score': 'Team2_Score'})
    out = out.drop(['WSeed', 'LSeed', 'WLoc'], axis=1)
    out['Result'] = 1.0 if team1_won else 0.0
    return out

# One oriented copy per game from each side's perspective; the duplicated
# rename/drop logic of the original is factored into _relabel.
winners = _relabel(df_concat, True)
losers = _relabel(df_concat, False)

train = pd.concat([winners, losers], axis=0).reset_index(drop=True)

# Derived score features: ratio, total, and Team1's share of the total.
train['Score_Ratio'] = train['Team1_Score'] / train['Team2_Score']
train['Score_Total'] = train['Team1_Score'] + train['Team2_Score']
train['Score_Pct'] = train['Team1_Score'] / train['Score_Total']

We will only consider years relevant to our test submission


In [8]:
# Which seasons does the submission cover? (Output below: only 2018.)
df_sample_sub['Season'].unique()


Out[8]:
array([2018])

Now let's keep only the matchups that also appear in the submission (inner join).


In [9]:
# Keep only 2018 training rows, then inner-join against the submission
# matchups so we retain exactly the pairs we are asked to predict.
train_2018 = train.loc[train['Season'].isin([2018]), :].reset_index(drop=True)
submission_keys = df_sample_sub.drop(['ID', 'Pred'], axis=1)
train_test_inner = pd.merge(train_2018, submission_keys,
                            on=['Season', 'TeamID1', 'TeamID2'], how='inner')

In [10]:
# Peek at the matched 2018 training rows.
train_test_inner.head()


Out[10]:
DayNum DiffSeed NumOT Result Season Team1_Score Team2_Score TeamID1 TeamID2 Score_Ratio Score_Total Score_Pct
0 11 1 0 1.0 2018 83 67 3234 3346 1.238806 150 0.553333
1 12 1 0 1.0 2018 66 51 3281 3346 1.294118 117 0.564103
2 18 1 0 1.0 2018 95 63 3326 3346 1.507937 158 0.601266
3 40 0 0 1.0 2018 60 46 3343 3346 1.304348 106 0.566038
4 11 1 0 1.0 2018 68 53 3280 3438 1.283019 121 0.561983

From the inner join, we will create data per team id to estimate the parameters we are missing that are independent of the year. Essentially, we are trying to estimate the average behavior of the team across the year.


In [11]:
# Per-team median number of overtimes, computed separately for the
# TeamID1 and TeamID2 roles, then joined on Season.
team1d_num_ot = train_test_inner.groupby(['Season', 'TeamID1'])['NumOT'].median().reset_index()\
.set_index('Season').rename(columns = {'NumOT' : 'NumOT1'})
team2d_num_ot = train_test_inner.groupby(['Season', 'TeamID2'])['NumOT'].median().reset_index()\
.set_index('Season').rename(columns = {'NumOT' : 'NumOT2'})

num_ot = team1d_num_ot.join(team2d_num_ot).reset_index()

# Combined OT estimate for a matchup: the rounded sum of the two medians.
# NOTE(review): the original comment claimed a "subtract by one" step to
# prevent overcounting, but no subtraction actually happens here.
num_ot['NumOT'] = num_ot[['NumOT1', 'NumOT2']].apply(lambda x : round( x.sum() ), axis = 1 )

num_ot.head()


Out[11]:
Season TeamID1 NumOT1 TeamID2 NumOT2 NumOT
0 2018 3110 0 3141 0.0 0.0
1 2018 3110 0 3143 0.0 0.0
2 2018 3110 0 3163 0.0 0.0
3 2018 3110 0 3177 0.0 0.0
4 2018 3110 0 3179 4.0 4.0

Here we look at the comparable statistics. For the TeamID2 column, we would consider the inverse of the ratio, and 1 minus the score attempt percentage.


In [12]:
# Per-team median score ratio and score share, split by Team1/Team2 role.
team1d_score_spread = train_test_inner.groupby(['Season', 'TeamID1'])[['Score_Ratio', 'Score_Pct']].median().reset_index()\
.set_index('Season').rename(columns={'Score_Ratio': 'Score_Ratio1', 'Score_Pct': 'Score_Pct1'})
team2d_score_spread = train_test_inner.groupby(['Season', 'TeamID2'])[['Score_Ratio', 'Score_Pct']].median().reset_index()\
.set_index('Season').rename(columns={'Score_Ratio': 'Score_Ratio2', 'Score_Pct': 'Score_Pct2'})

score_spread = team1d_score_spread.join(team2d_score_spread).reset_index()

# Geometric mean of team 1's score ratio and the inverse of team 2's.
# Vectorized column arithmetic replaces the original row-wise applies.
score_spread['Score_Ratio'] = (score_spread['Score_Ratio1'] / score_spread['Score_Ratio2']) ** 0.5

# Harmonic mean of team 1's score share and (1 - team 2's score share).
score_spread['Score_Pct'] = 1.0 / (0.5 / score_spread['Score_Pct1'] + 0.5 / (1.0 - score_spread['Score_Pct2']))

score_spread.head()


Out[12]:
Season TeamID1 Score_Ratio1 Score_Pct1 TeamID2 Score_Ratio2 Score_Pct2 Score_Ratio Score_Pct
0 2018 3110 0.822194 0.449524 3141 1.036585 0.508982 0.890604 0.469356
1 2018 3110 0.822194 0.449524 3143 1.241952 0.552779 0.813645 0.448369
2 2018 3110 0.822194 0.449524 3163 0.573171 0.364341 1.197692 0.526628
3 2018 3110 0.822194 0.449524 3177 0.870130 0.465278 0.972065 0.488436
4 2018 3110 0.822194 0.449524 3179 1.080000 0.519231 0.872520 0.464622

Now let's create a model based solely on the inner-join data and predict those probabilities.

We will get the teams with the missing result.


In [13]:
# Features and labels for the matchups we actually observed.
X_train = train_test_inner.loc[:, ['Season', 'NumOT', 'Score_Ratio', 'Score_Pct']]
train_labels = train_test_inner['Result']

# Outer-join historical seasons against the submission matchups; rows
# whose Result is null are matchups we never observed and must impute.
historical = train.loc[train['Season'].isin([2014, 2015, 2016, 2017]), :].reset_index(drop=True)
train_test_outer = pd.merge(historical,
                            df_sample_sub.drop(['ID', 'Pred'], axis=1),
                            on=['Season', 'TeamID1', 'TeamID2'], how='outer')
train_test_outer = train_test_outer.loc[train_test_outer['Result'].isnull(),
                                        ['TeamID1', 'TeamID2', 'Season']]

# Attach the estimated spread features, then the OT estimates, to the
# missing matchups.
missing_with_spread = pd.merge(score_spread.loc[:, ['TeamID1', 'TeamID2', 'Season', 'Score_Ratio', 'Score_Pct']],
                               train_test_outer, on=['TeamID1', 'TeamID2', 'Season'])
train_test_missing = pd.merge(missing_with_spread,
                              num_ot.loc[:, ['TeamID1', 'TeamID2', 'Season', 'NumOT']],
                              on=['TeamID1', 'TeamID2', 'Season'])

We scale our data for our classifier (a cross-validated logistic regression), and make sure our categorical variables are properly processed.


In [14]:
X_test = train_test_missing.loc[:, ['Season', 'NumOT', 'Score_Ratio', 'Score_Pct']]

# Remember how many rows are training data so the combined frame can be
# split back apart after encoding.
n = X_train.shape[0]

# Stack train and test so both receive identical dummy columns.
train_test_merge = pd.concat([X_train, X_test], axis=0).reset_index(drop=True)

# One-hot encode Season and NumOT (cast to object so get_dummies treats
# them as categoricals), prepending the dummy columns each time.
train_test_merge = pd.concat([pd.get_dummies(train_test_merge['Season'].astype(object)),
                              train_test_merge.drop('Season', axis=1)], axis=1)

train_test_merge = pd.concat([pd.get_dummies(train_test_merge['NumOT'].astype(object)),
                              train_test_merge.drop('NumOT', axis=1)], axis=1)

# Split positionally with iloc: clearer than the original label-based
# loc[:(n - 1)], and correct even if the index were not a clean RangeIndex.
X_train = train_test_merge.iloc[:n, :].reset_index(drop=True)
X_test = train_test_merge.iloc[n:, :].reset_index(drop=True)

In [15]:
# Min-max scale both sets using training-set statistics only; the tiny
# epsilon keeps constant columns from dividing by zero.
x_min = X_train.min()
x_range = X_train.max() - x_min + 1e-14
X_train = (X_train - x_min) / x_range
X_test = (X_test - x_min) / x_range

In [16]:
# Class balance of the training labels (output below: 112 wins, 108 losses).
train_labels.value_counts()


Out[16]:
1.0    112
0.0    108
Name: Result, dtype: int64

In [17]:
# Inspect the scaled feature matrix (dummy columns + score features).
X_train.head()


Out[17]:
0.0 1.0 2.0 4.0 2018 Score_Ratio Score_Pct
0 1.0 0.0 0.0 0.0 0.0 0.459079 0.623536
1 1.0 0.0 0.0 0.0 0.0 0.496385 0.657950
2 1.0 0.0 0.0 0.0 0.0 0.640596 0.776708
3 1.0 0.0 0.0 0.0 0.0 0.503285 0.664134
4 1.0 0.0 0.0 0.0 0.0 0.488899 0.651178

In [18]:
from sklearn.linear_model import LogisticRegressionCV

# Baseline: cross-validated logistic regression over the default C grid,
# scored by negative log-loss with 80 folds.
model = LogisticRegressionCV(cv=80, scoring="neg_log_loss", random_state=1)
model.fit(X_train, train_labels)

# Mean CV score per candidate C for class 1 (averaged over the folds).
# Leftover debugging statements from the original (dir(model), a
# discarded np.power list) have been removed.
Cs = model.Cs_
sco = model.scores_[1].mean(axis=0)

import matplotlib.pyplot as plt
plt.plot(Cs, sco)
plt.xlabel('C')
plt.ylabel('mean neg_log_loss')
plt.show()
sco.min()


Out[18]:
-0.6928690068542296

In [19]:
# Candidate C grid: fine steps between ~1.05e-9 and 1.051e-9
# (overwritten by later grid cells; kept for the record).
Cs_ = list(np.arange(1.1e-9 - 5e-11, 1.051e-9, 0.2e-13))
len(Cs_)


Out[19]:
51

In [20]:
# Candidate C grid: 1e-11 up to 9.04e-11 in 2e-13 steps
# (overwritten by later grid cells; kept for the record).
Cs_ = list(np.arange(1e-11, 9.04e-11, 0.2e-12))
len(Cs_)


Out[20]:
402

In [21]:
#Cs_= list(np.arange(5.6e-13             - ( (0.01e-13)*1)
#                    ,5.61e-13            - ( (0.01e-13)*1)#1.0508e-9 
#                    ,0.2e-15))
#len(Cs_)

In [22]:
# Candidate C grid: 1e-11 up to 5.5e-11 in 2e-13 steps
# (overwritten by later grid cells; kept for the record).
Cs_ = list(np.arange(1e-11, 5.5e-11, 0.2e-12))
len(Cs_)


Out[22]:
225

In [23]:
# Candidate C grid: widened lower bound down to 1e-14
# (overwritten by later grid cells; kept for the record).
Cs_ = list(np.arange(1e-14, 5.5e-11, 0.2e-12))
len(Cs_)


Out[23]:
275

In [24]:
#Cs_= list(np.arange(1.5e-11
#                    ,2.53e-11#1.0508e-9 
#                    ,0.2e-13)) #+[3.761e-11]
#len(Cs_)

In [25]:
#X_train.dtypes

In [26]:
# Candidate C grid: 1e-15 up to 5.1e-11 in 1e-13 steps
# (overwritten by later grid cells; kept for the record).
Cs_ = list(np.arange(1e-15, 0.51e-10, 0.1e-12))
len(Cs_)


Out[26]:
510

In [27]:
# Candidate C grid: an even lower range, 9e-14 up to 1.01e-12
# (overwritten by later grid cells; kept for the record).
Cs_ = list(np.arange(9e-14, 10.1e-13, 0.1e-14))
len(Cs_)


Out[27]:
920

In [ ]:


In [28]:
# Candidate C grid: 500 evenly spaced values from 1e-19 to 6.1e-12
# (overwritten by the logspace grid below; kept for the record).
Cs_ = list(np.linspace(1e-19, 0.610e-11, 500))
len(Cs_)


Out[28]:
500

In [29]:
# Final candidate C grid: 500 log-spaced values in the very narrow band
# 10^-12.5715 .. 10^-12.5714; this is the grid the refit below uses.
Cs_ = list(np.logspace(-12.5715, -12.5714, 500))
len(Cs_)


Out[29]:
500

In [30]:
#LogisticRegressionCV(Cs=10, class_weight=None, cv=107, dual=False,
#           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
#           multi_class='ovr', n_jobs=1, penalty='l2', random_state=2,
#           refit=True, scoring='neg_log_loss', solver='lbfgs', tol=0.0001,
#           verbose=0) #-0.7
from sklearn.linear_model import LogisticRegressionCV
model = LogisticRegressionCV(cv=80,scoring="neg_log_loss",random_state=1
                             #,penalty="l1"
                             ,Cs= Cs_#list(np.arange(1e-7,1e-9,-0.5e-9)) # [0.5,0.1,0.01,0.001] #list(np.power(1, np.arange(-10, 10)))
                             ,max_iter=1000, tol=1e-11
                             #,solver="liblinear"
                             ,n_jobs=4)
model.fit(X_train, train_labels)

#---
Cs = model.Cs_
list(np.power(10.0, np.arange(-10, 10)))
dir(model)
sco = model.scores_[1].mean(axis=0)
#---
import matplotlib.pyplot as plt
plt.plot(#Cs
    np.log10(Cs)
    ,sco)
# plt.ylabel('some numbers')
index_min = np.argmin(sco)
Cs_[index_min]
plt.axvline(x=np.log10(Cs_[index_min]))

plt.show()



In [31]:
# C value at the minimum mean CV score. NOTE(review): since sco holds
# neg_log_loss (higher is better), argmin selects the worst-scoring C.
index_min = np.argmin(sco)
Cs_[index_min] #3.761e-11


Out[31]:
2.6822546076084914e-13

In [32]:
#list(np.power(10.0, np.arange(-10, 10)))
#list(np.arange(0.5,1e-4,-0.05))
print(sco.max())
#-0.6931471779248422
print(sco.min() < -0.693270048530996)
print(sco.min()+0.693270048530996)
sco.min()


-0.69314718032509
False
0.0001228679716934744
Out[32]:
-0.6931471805593026

In [33]:
import matplotlib.pyplot as plt
# Plot the per-fold CV scores for class 1 across the C grid.
plt.plot(model.scores_[1])
# plt.ylabel('some numbers')
plt.show()


Here we store our probabilities


In [34]:
# Predicted win probability for TeamID1 (class 1), for both the observed
# (inner) rows and the imputed (missing) rows.
train_test_inner['Pred1'] = model.predict_proba(X_train)[:,1]
train_test_missing['Pred1'] = model.predict_proba(X_test)[:,1]

We merge our predictions


In [35]:
# Stack the imputed and observed predictions, then outer-join them onto
# the sample submission so every required matchup has a row.
all_preds = pd.concat([train_test_missing.loc[:, ['Season', 'TeamID1', 'TeamID2', 'Pred1']],
                       train_test_inner.loc[:, ['Season', 'TeamID1', 'TeamID2', 'Pred1']]],
                      axis=0).reset_index(drop=True)
sub = pd.merge(df_sample_sub, all_preds,
               on=['Season', 'TeamID1', 'TeamID2'], how='outer')

We get the 'average' (harmonic-mean) probability of success for each team.


In [36]:
# Harmonic mean of each team's predicted probabilities per role (not an
# arithmetic average); teams with no valid predictions fall back to 0.5.
team1_probs = sub.groupby('TeamID1')['Pred1'].apply(lambda x : (x ** -1.0).mean() ** -1.0 ).fillna(0.5).to_dict()
team2_probs = sub.groupby('TeamID2')['Pred1'].apply(lambda x : (x ** -1.0).mean() ** -1.0 ).fillna(0.5).to_dict()

Any missing value for the prediction will be imputed with the product of the probabilities calculated above. We assume these are independent events.


In [37]:
def _fill_pred(row):
    """Use the model probability when present; otherwise impute with
    P(team1 wins) * P(team2 loses), treating the events as independent.

    Label-based access replaces the original positional x[0]/x[1]/x[2]
    indexing, which is deprecated for labelled Series in modern pandas.
    """
    if np.isnan(row['Pred1']):
        return team1_probs.get(row['TeamID1']) * (1 - team2_probs.get(row['TeamID2']))
    return row['Pred1']

sub['Pred'] = sub[['TeamID1', 'TeamID2', 'Pred1']].apply(_fill_pred, axis=1)

In [38]:
# Drop duplicate matchup IDs (keeping the first occurrence) and write
# the two-column submission file.
sub = sub.drop_duplicates(subset=["ID"], keep='first')

sub[['ID', 'Pred']].to_csv('sub.csv', index = False)

In [39]:
# Preview the first 20 submission rows.
sub[['ID', 'Pred']].head(20)


Out[39]:
ID Pred
0 2018_3110_3113 0.25
1 2018_3110_3114 0.25
2 2018_3110_3124 0.25
3 2018_3110_3125 0.25
4 2018_3110_3129 0.25
5 2018_3110_3138 0.25
6 2018_3110_3141 0.50
7 2018_3110_3143 0.50
8 2018_3110_3163 0.50
9 2018_3110_3166 0.25
10 2018_3110_3169 0.25
11 2018_3110_3173 0.25
12 2018_3110_3177 0.50
13 2018_3110_3179 0.50
14 2018_3110_3181 0.25
15 2018_3110_3189 0.25
16 2018_3110_3195 0.50
17 2018_3110_3199 0.50
18 2018_3110_3203 0.50
20 2018_3110_3208 0.25

In [ ]: