In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

First, we import the datasets of interest: the tournament seeds and the regular-season compact results.


In [2]:
#the seed information
df_seeds = pd.read_csv('../input/WNCAATourneySeeds_SampleTourney2018.csv')

#tour information
df_tour = pd.read_csv('../input/WRegularSeasonCompactResults_PrelimData2018.csv')

Now we extract each team's numeric seed and merge the winners' and losers' seeds onto the game results


In [3]:
#extract the numeric part of the seed string (e.g. 'W01' -> 1)
df_seeds['seed_int'] = df_seeds['Seed'].apply(lambda x : int(x[1:3]))

df_winseeds = df_seeds.loc[:, ['TeamID', 'Season', 'seed_int']].rename(columns={'TeamID':'WTeamID', 'seed_int':'WSeed'})
df_lossseeds = df_seeds.loc[:, ['TeamID', 'Season', 'seed_int']].rename(columns={'TeamID':'LTeamID', 'seed_int':'LSeed'})
df_dummy = pd.merge(left=df_tour, right=df_winseeds, how='left', on=['Season', 'WTeamID'])
df_concat = pd.merge(left=df_dummy, right=df_lossseeds, on=['Season', 'LTeamID'])

Now we flag, in the merged dataset above, whether the two teams' seeds differ


In [4]:
#0 if the winner and loser had the same seed, 1 otherwise
df_concat['DiffSeed'] = (df_concat['LSeed'] != df_concat['WSeed']).astype(int)

Here we get our submission info


In [5]:
#prepares sample submission
df_sample_sub = pd.read_csv('../input/WSampleSubmissionStage2.csv')

In [6]:
df_sample_sub['Season'] = df_sample_sub['ID'].apply(lambda x : int(x.split('_')[0]) )
df_sample_sub['TeamID1'] = df_sample_sub['ID'].apply(lambda x : int(x.split('_')[1]) )
df_sample_sub['TeamID2'] = df_sample_sub['ID'].apply(lambda x : int(x.split('_')[2]) )
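
Equivalently, the ID can be parsed with a single vectorized split instead of three `apply` calls (a sketch, assuming the `Season_TeamID1_TeamID2` format used above):

#vectorized alternative: split once, then assign all three columns by position
parts = df_sample_sub['ID'].str.split('_', expand=True).astype(int)
df_sample_sub[['Season', 'TeamID1', 'TeamID2']] = parts.to_numpy()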

Training Data Creation


In [7]:
winners = df_concat.rename(columns = {'WTeamID' : 'TeamID1',
                                      'LTeamID' : 'TeamID2',
                                      'WScore' : 'Team1_Score',
                                      'LScore' : 'Team2_Score'}).drop(['WSeed', 'LSeed', 'WLoc'], axis = 1)
winners['Result'] = 1.0

losers = df_concat.rename(columns = {'WTeamID' : 'TeamID2',
                                     'LTeamID' : 'TeamID1',
                                     'WScore' : 'Team2_Score',
                                     'LScore' : 'Team1_Score'}).drop(['WSeed', 'LSeed', 'WLoc'], axis = 1)
losers['Result'] = 0.0

#stack the two views so each game appears once from each team's perspective
train = pd.concat([winners, losers], axis = 0).reset_index(drop = True)

train['Score_Ratio'] = train['Team1_Score'] / train['Team2_Score']
train['Score_Total'] = train['Team1_Score'] + train['Team2_Score']
train['Score_Pct'] = train['Team1_Score'] / train['Score_Total']

We will only consider years relevant to our test submission


In [8]:
df_sample_sub['Season'].unique()

Now let's restrict to matchups that also appear in the submission file by inner-joining the training data with it.


In [9]:
train_test_inner = pd.merge( train.loc[ train['Season'].isin([2018]), : ].reset_index(drop = True), 
         df_sample_sub.drop(['ID', 'Pred'], axis = 1), 
         on = ['Season', 'TeamID1', 'TeamID2'], how = 'inner' )

In [10]:
train_test_inner.head()

From the inner join, we will create per-team aggregates to estimate the features we are missing, treating them as independent of the year. Essentially, we use per-team medians as a robust estimate of each team's typical behavior across the season.


In [11]:
team1d_num_ot = train_test_inner.groupby(['Season', 'TeamID1'])['NumOT'].median().reset_index()\
.set_index('Season').rename(columns = {'NumOT' : 'NumOT1'})
team2d_num_ot = train_test_inner.groupby(['Season', 'TeamID2'])['NumOT'].median().reset_index()\
.set_index('Season').rename(columns = {'NumOT' : 'NumOT2'})

num_ot = team1d_num_ot.join(team2d_num_ot).reset_index()

#combine the two per-team medians: sum them and round to the nearest integer
num_ot['NumOT'] = num_ot[['NumOT1', 'NumOT2']].sum(axis = 1).round().astype(int)

num_ot.head()

Here we look at the comparable statistics. For the TeamID2 column, we use the inverse of the score ratio and one minus the score percentage.
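
Concretely, the next cell combines the two per-team medians as

$$\text{Score\_Ratio} = \sqrt{R_1 \cdot R_2^{-1}}, \qquad \text{Score\_Pct} = \left(\frac{0.5}{P_1} + \frac{0.5}{1 - P_2}\right)^{-1}$$

where $R_i$ and $P_i$ are team $i$'s median score ratio and median score percentage.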


In [12]:
team1d_score_spread = train_test_inner.groupby(['Season', 'TeamID1'])[['Score_Ratio', 'Score_Pct']].median().reset_index()\
.set_index('Season').rename(columns = {'Score_Ratio' : 'Score_Ratio1', 'Score_Pct' : 'Score_Pct1'})
team2d_score_spread = train_test_inner.groupby(['Season', 'TeamID2'])[['Score_Ratio', 'Score_Pct']].median().reset_index()\
.set_index('Season').rename(columns = {'Score_Ratio' : 'Score_Ratio2', 'Score_Pct' : 'Score_Pct2'})

score_spread = team1d_score_spread.join(team2d_score_spread).reset_index()

#geometric mean of score ratio of team 1 and inverse of team 2
score_spread['Score_Ratio'] = score_spread[['Score_Ratio1', 'Score_Ratio2']].apply(lambda x : ( x[0] * ( x[1] ** -1.0) ), axis = 1 ) ** 0.5

#harmonic mean of score pct
score_spread['Score_Pct'] = score_spread[['Score_Pct1', 'Score_Pct2']].apply(lambda x : 0.5*( x[0] ** -1.0 ) + 0.5*( 1.0 - x[1] ) ** -1.0, axis = 1 ) ** -1.0

score_spread.head()

Now let's create a model based solely on the inner-join data and predict those probabilities.

We will also collect the team pairings whose result is missing.


In [13]:
X_train = train_test_inner.loc[:, ['Season', 'NumOT', 'Score_Ratio', 'Score_Pct']]
train_labels = train_test_inner['Result']

train_test_outer = pd.merge( train.loc[ train['Season'].isin([2014, 2015, 2016, 2017]), : ].reset_index(drop = True), 
         df_sample_sub.drop(['ID', 'Pred'], axis = 1), 
         on = ['Season', 'TeamID1', 'TeamID2'], how = 'outer' )

train_test_outer = train_test_outer.loc[ train_test_outer['Result'].isnull(), 
                                        ['TeamID1', 'TeamID2', 'Season']]

train_test_missing = pd.merge( pd.merge( score_spread.loc[:, ['TeamID1', 'TeamID2', 'Season', 'Score_Ratio', 'Score_Pct']], 
                   train_test_outer, on = ['TeamID1', 'TeamID2', 'Season']),
         num_ot.loc[:, ['TeamID1', 'TeamID2', 'Season', 'NumOT']],
         on = ['TeamID1', 'TeamID2', 'Season'])

We scale our data for the classifier and make sure our categorical variables (Season and NumOT) are one-hot encoded.
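
The scaling applied two cells below is a min-max normalization with a small constant guarding against division by zero:

$$x' = \frac{x - x_{\min}}{x_{\max} - x_{\min} + 10^{-14}}$$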


In [14]:
X_test = train_test_missing.loc[:, ['Season', 'NumOT', 'Score_Ratio', 'Score_Pct']]

n = X_train.shape[0]

train_test_merge = pd.concat( [X_train, X_test], axis = 0 ).reset_index(drop = True)

train_test_merge = pd.concat( [pd.get_dummies( train_test_merge['Season'].astype(object) ), 
            train_test_merge.drop('Season', axis = 1) ], axis = 1 )

train_test_merge = pd.concat( [pd.get_dummies( train_test_merge['NumOT'].astype(object) ), 
            train_test_merge.drop('NumOT', axis = 1) ], axis = 1 )

X_train = train_test_merge.loc[:(n - 1), :].reset_index(drop = True)
X_test = train_test_merge.loc[n:, :].reset_index(drop = True)
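
Because the dummies were computed on the concatenated frame, train and test are guaranteed to share one column layout even if a Season or NumOT level appears in only one of them. A quick sanity check (not in the original notebook):

#both frames should expose identical one-hot columns, split at row n
assert list(X_train.columns) == list(X_test.columns)
assert len(X_train) == n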

In [15]:
x_max = X_train.max()
x_min = X_train.min()

X_train = ( X_train - x_min ) / ( x_max - x_min + 1e-14)
X_test = ( X_test - x_min ) / ( x_max - x_min + 1e-14)
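
Note that `x_min` and `x_max` are computed from `X_train` only and then applied to `X_test` as well, so no test-set statistics leak into the scaling.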

In [16]:
train_labels.value_counts()

In [17]:
X_train.head()

In [18]:
from sklearn.linear_model import LogisticRegressionCV
import matplotlib.pyplot as plt

#cross-validated search over a grid of C values (inverse regularization strength);
#optionally pass Cs=Cs_ to search a custom grid
model = LogisticRegressionCV(cv=80, scoring="neg_log_loss", random_state=1)
model.fit(X_train, train_labels)

#mean cross-validation score for each candidate C
Cs = model.Cs_
sco = model.scores_[1].mean(axis=0)

plt.plot(Cs, sco)
plt.xlabel('C')
plt.ylabel('mean neg. log loss')
plt.show()
sco.min()
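
The next several cells manually sweep progressively finer grids of candidate `C` values; each one only reports `len(Cs_)` to confirm the grid size before the grid is handed to one of the cross-validated searches.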

In [19]:
Cs_ = list(np.arange(1.1e-9 - 5e-11, 1.051e-9, 0.2e-13))
len(Cs_)

In [20]:
Cs_ = list(np.arange(1e-11, 9.04e-11, 0.2e-12))
len(Cs_)

In [22]:
Cs_ = list(np.arange(1e-11, 5.5e-11, 0.2e-12))
len(Cs_)

In [23]:
Cs_ = list(np.arange(1e-14, 5.5e-11, 0.2e-12))
len(Cs_)


In [26]:
Cs_ = list(np.arange(1e-15, 0.51e-10, 0.1e-12))
len(Cs_)

In [27]:
Cs_ = list(np.arange(9e-14, 10.1e-13, 0.1e-14))
len(Cs_)

In [34]:
from sklearn.linear_model import LogisticRegression

#refit a plain LogisticRegression at a fixed C chosen from the searches above;
#note that `scoring` is a LogisticRegressionCV parameter and does not exist here,
#and neither do the Cs_/scores_ attributes the CV plot relied on
model = LogisticRegression(random_state=1,
                           C=8.129999999999969e-13,
                           max_iter=1000, tol=1e-11,
                           n_jobs=4)
model.fit(X_train, train_labels)
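
In scikit-learn, `C` is the inverse regularization strength, so a value of order $10^{-13}$ shrinks the weights almost entirely to zero and leaves the model nearly constant. A quick illustrative check (a sketch, not from the original run):

#under such heavy shrinkage the fitted probabilities should span a very narrow range
probs = model.predict_proba(X_train)[:, 1]
print(probs.min(), probs.max())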

In [96]:
Cs = list(np.linspace(9e-15, 10.1e-14, 200))
len(Cs)

In [97]:
#sklearn.grid_search was removed; GridSearchCV now lives in sklearn.model_selection
from sklearn.model_selection import GridSearchCV

parameters = dict(C=Cs)
model = LogisticRegression(random_state=1,
                           max_iter=1000, tol=1e-11,
                           solver="lbfgs",
                           n_jobs=1)
clf = GridSearchCV(model, parameters, scoring="neg_log_loss", cv=80, n_jobs=8)
clf.fit(X_train, train_labels)

#mean cross-validated score for each candidate C
scores = np.array(clf.cv_results_['mean_test_score'])

plt.plot(Cs, scores)
plt.xlabel('Cs')
plt.ylabel('Mean score')
plt.show()

In [98]:
print("C:",clf.best_estimator_.C," loss:",clf.best_score_)
clf.grid_scores_

In [46]:
import matplotlib.pyplot as plt
%matplotlib inline
#grid_scores_ no longer exists; plot the mean CV score per candidate C instead
plt.plot(Cs, scores)
plt.show()

In [30]:
index_min = np.argmin(sco)
print(Cs_[index_min]) #3.761e-11
sco.min()

In [31]:
#ln 2 ≈ 0.6931471 is the log loss of always predicting 0.5;
#-0.693270048530996 is a previously recorded best mean score
print(sco.max())
print(sco.min() < -0.693270048530996)
print(sco.min() + 0.693270048530996)
sco.min()

In [32]:
import matplotlib.pyplot as plt
#per-fold CV scores for each candidate C, from the LogisticRegressionCV fit
plt.plot(model.scores_[1])
plt.show()

Here we store our probabilities


In [ ]:
train_test_inner['Pred1'] = model.predict_proba(X_train)[:,1]
train_test_missing['Pred1'] = model.predict_proba(X_test)[:,1]

We merge our predictions


In [ ]:
sub = pd.merge(df_sample_sub, 
                         pd.concat( [train_test_missing.loc[:, ['Season', 'TeamID1', 'TeamID2', 'Pred1']],
                                     train_test_inner.loc[:, ['Season', 'TeamID1', 'TeamID2', 'Pred1']] ],
                                   axis = 0).reset_index(drop = True),
                  on = ['Season', 'TeamID1', 'TeamID2'], how = 'outer')

We compute the 'average' (harmonic-mean) probability of success for each team
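
The 'average' computed below is a harmonic mean of each team's predicted probabilities, with teams missing a prediction defaulting to 0.5:

$$\bar{p} = \left(\frac{1}{n}\sum_{i=1}^{n} p_i^{-1}\right)^{-1}$$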


In [ ]:
team1_probs = sub.groupby('TeamID1')['Pred1'].apply(lambda x : (x ** -1.0).mean() ** -1.0 ).fillna(0.5).to_dict()
team2_probs = sub.groupby('TeamID2')['Pred1'].apply(lambda x : (x ** -1.0).mean() ** -1.0 ).fillna(0.5).to_dict()

Any missing value for the prediction will be imputed with the product of the probabilities calculated above. We assume these are independent events.


In [ ]:
sub['Pred'] = sub[['TeamID1', 'TeamID2','Pred1']]\
.apply(lambda x : team1_probs.get(x[0]) * ( 1 - team2_probs.get(x[1]) ) if np.isnan(x[2]) else x[2], 
       axis = 1)

In [ ]:
sub = sub.drop_duplicates(subset=["ID"], keep='first')

sub[['ID', 'Pred']].to_csv('sub.csv', index = False)

In [ ]:
sub[['ID', 'Pred']].head(20)
