In [1]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cross_validation import cross_val_score, StratifiedKFold , train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.multiclass import OneVsRestClassifier
import xgboost as xgb



In [2]:

    
match_df = pd.read_csv('match_cluster.csv')



In [3]:

    
match_df = match_df.drop(['Unnamed: 0','link_odsp','adv_stats'], axis = 1)
match_df.head()









    Out[3]:






  
    
      
      id_odsp
      date
      league
      season
      country
      ht
      at
      fthg
      ftag
      lead_change
      late_goals
      id_event
      avg_goal_diff
      cluster
      month
      year
    
  
  
    
      0
      UFot0hit/
      2011-08-05
      D1
      2012
      germany
      Borussia Dortmund
      Hamburg SV
      3
      1
      1.0
      1.0
      110
      1.835165
      0
      8
      2011
    
    
      1
      Aw5DflLH/
      2011-08-06
      D1
      2012
      germany
      FC Augsburg
      SC Freiburg
      2
      2
      4.0
      1.0
      122
      0.340659
      1
      8
      2011
    
    
      2
      bkjpaC6n/
      2011-08-06
      D1
      2012
      germany
      Werder Bremen
      Kaiserslautern
      2
      0
      1.0
      1.0
      137
      0.450549
      2
      8
      2011
    
    
      3
      CzPV312a/
      2011-08-06
      F1
      2012
      france
      Paris Saint-Germain
      Lorient
      0
      1
      1.0
      0.0
      114
      0.692308
      2
      8
      2011
    
    
      4
      GUOdmtII/
      2011-08-06
      F1
      2012
      france
      Caen
      Valenciennes
      1
      0
      1.0
      0.0
      90
      0.626374
      3
      8
      2011



In [54]:

    
england = match_df[match_df.country == 'england']
england.head()









    Out[54]:






  
    
      
      id_odsp
      date
      league
      season
      country
      ht
      at
      fthg
      ftag
      lead_change
      late_goals
      id_event
      avg_goal_diff
      cluster
      month
      year
    
  
  
    
      3062
      foAjBseC/
      2013-10-05
      E0
      2014
      england
      Hull
      Aston Villa
      0
      0
      0.0
      0.0
      80
      0.000000
      3
      10
      2013
    
    
      3063
      GfBnCNu6/
      2013-10-05
      E0
      2014
      england
      Fulham
      Stoke City
      1
      0
      1.0
      1.0
      104
      0.136842
      3
      10
      2013
    
    
      3067
      j7h8TOAt/
      2013-10-05
      E0
      2014
      england
      Sunderland
      Manchester Utd
      1
      2
      3.0
      0.0
      94
      0.824176
      1
      10
      2013
    
    
      3068
      l89fA1AI/
      2013-10-05
      E0
      2014
      england
      Liverpool
      Crystal Palace
      3
      1
      1.0
      0.0
      81
      2.105263
      0
      10
      2013
    
    
      3069
      ngSc9LPO/
      2013-10-05
      E0
      2014
      england
      Manchester City
      Everton
      3
      1
      3.0
      0.0
      97
      0.831579
      1
      10
      2013



In [97]:

    
spain = match_df[match_df.country == 'spain']
spain.head()









    Out[97]:






  
    
      
      id_odsp
      date
      league
      season
      country
      ht
      at
      fthg
      ftag
      lead_change
      late_goals
      id_event
      avg_goal_diff
      cluster
      month
      year
    
  
  
    
      59
      CtEdUSdJ/
      2011-08-27
      SP1
      2012
      spain
      Sporting Gijon
      Real Sociedad
      1
      2
      1.0
      0.0
      99
      0.648352
      3
      8
      2011
    
    
      63
      jRPmWlS6/
      2011-08-27
      SP1
      2012
      spain
      Valencia
      Racing Santander
      4
      3
      5.0
      2.0
      103
      0.890110
      1
      8
      2011
    
    
      73
      C88uY13B/
      2011-08-28
      SP1
      2012
      spain
      Real Zaragoza
      Real Madrid
      0
      6
      1.0
      2.0
      137
      2.087912
      0
      8
      2011
    
    
      76
      tvgGyp4n/
      2011-08-28
      SP1
      2012
      spain
      Sevilla
      Malaga
      2
      1
      1.0
      1.0
      85
      1.582418
      0
      8
      2011
    
    
      77
      UaoTYsl5/
      2011-08-28
      SP1
      2012
      spain
      Athletic Bilbao
      Rayo Vallecano
      1
      1
      2.0
      0.0
      117
      0.065934
      2
      8
      2011



In [98]:

    
germany = match_df[match_df.country == 'germany']
germany.head()









    Out[98]:






  
    
      
      id_odsp
      date
      league
      season
      country
      ht
      at
      fthg
      ftag
      lead_change
      late_goals
      id_event
      avg_goal_diff
      cluster
      month
      year
    
  
  
    
      0
      UFot0hit/
      2011-08-05
      D1
      2012
      germany
      Borussia Dortmund
      Hamburg SV
      3
      1
      1.0
      1.0
      110
      1.835165
      0
      8
      2011
    
    
      1
      Aw5DflLH/
      2011-08-06
      D1
      2012
      germany
      FC Augsburg
      SC Freiburg
      2
      2
      4.0
      1.0
      122
      0.340659
      1
      8
      2011
    
    
      2
      bkjpaC6n/
      2011-08-06
      D1
      2012
      germany
      Werder Bremen
      Kaiserslautern
      2
      0
      1.0
      1.0
      137
      0.450549
      2
      8
      2011
    
    
      5
      lOpzwMkp/
      2011-08-06
      D1
      2012
      germany
      Hertha Berlin
      Nurnberg
      0
      1
      1.0
      1.0
      135
      0.120879
      2
      8
      2011
    
    
      9
      Wn69eU5B/
      2011-08-06
      D1
      2012
      germany
      FC Cologne
      VfL Wolfsburg
      0
      3
      1.0
      2.0
      118
      0.890110
      2
      8
      2011



In [99]:

    
france = match_df[match_df.country == 'france']
france.head()









    Out[99]:






  
    
      
      id_odsp
      date
      league
      season
      country
      ht
      at
      fthg
      ftag
      lead_change
      late_goals
      id_event
      avg_goal_diff
      cluster
      month
      year
    
  
  
    
      3
      CzPV312a/
      2011-08-06
      F1
      2012
      france
      Paris Saint-Germain
      Lorient
      0
      1
      1.0
      0.0
      114
      0.692308
      2
      8
      2011
    
    
      4
      GUOdmtII/
      2011-08-06
      F1
      2012
      france
      Caen
      Valenciennes
      1
      0
      1.0
      0.0
      90
      0.626374
      3
      8
      2011
    
    
      6
      M7PhlM2C/
      2011-08-06
      F1
      2012
      france
      Brest
      Evian Thonon Gaillard
      2
      2
      2.0
      0.0
      115
      0.516484
      2
      8
      2011
    
    
      7
      QuWqjrYa/
      2011-08-06
      F1
      2012
      france
      AC Ajaccio
      Toulouse
      0
      2
      1.0
      0.0
      118
      0.703297
      2
      8
      2011
    
    
      8
      UBZQ4smg/
      2011-08-06
      F1
      2012
      france
      Nice
      Lyon
      1
      3
      3.0
      0.0
      129
      0.868132
      1
      8
      2011



In [100]:

    
italy = match_df[match_df.country == 'italy']
italy.head()









    Out[100]:






  
    
      
      id_odsp
      date
      league
      season
      country
      ht
      at
      fthg
      ftag
      lead_change
      late_goals
      id_event
      avg_goal_diff
      cluster
      month
      year
    
  
  
    
      84
      W0scuwXc/
      2011-09-09
      I1
      2012
      italy
      AC Milan
      Lazio
      2
      2
      2.0
      0.0
      111
      0.318681
      2
      9
      2011
    
    
      102
      zesOOHnH/
      2011-09-10
      I1
      2012
      italy
      Cesena
      Napoli
      1
      3
      3.0
      1.0
      104
      0.538462
      1
      9
      2011
    
    
      105
      Ait1vcn4/
      2011-09-11
      I1
      2012
      italy
      Palermo
      Internazionale
      4
      3
      5.0
      3.0
      118
      0.274725
      1
      9
      2011
    
    
      107
      EotSNy2N/
      2011-09-11
      I1
      2012
      italy
      Chievo Verona
      Novara
      2
      2
      2.0
      1.0
      122
      0.912088
      2
      9
      2011
    
    
      110
      jiZlsa2o/
      2011-09-11
      I1
      2012
      italy
      Juventus
      Parma
      4
      1
      1.0
      2.0
      86
      1.461538
      0
      9
      2011



In [4]:

    
## Easier to write a function to return league and year dataframe instead of pipeline it

def dummy_year_pipe(df, yr):
    ## df for only that league and season
    df_year = df[df.season == yr]
    
    ## getting unique clubs to create dummy vars for which two teams are playing
    df_yr_clubs = pd.unique(df_year[['ht','at']].values.ravel())
    
    ## drop first alphabetical club for comparison, return in DF to append to original df
    clubs = np.sort(df_yr_clubs)[1:len(df_yr_clubs)]
    club_df = pd.DataFrame(columns = clubs)
    
    df_year = df_year.append(club_df)
    
    for club in clubs:
        for i, row in df_year.iterrows():
            if row['ht'] == club or row['at'] == club:
                dummy = 1
            else:
                dummy = 0
            df_year.loc[i, club] = dummy
            
    return df_year



In [8]:

    
X_train = train[[u'AC Milan', u'AJ Auxerre', u'AS Nancy Lorraine', u'AS Roma',
       u'Atalanta', u'Athletic Bilbao', u'Atletico Madrid', u'Barcelona',
       u'Bayer Leverkusen', u'Bayern Munich', u'Bologna', u'Bordeaux',
       u'Borussia Dortmund', u'Borussia Monchengladbach', u'Brest', u'Caen',
       u'Cagliari', u'Catania', u'Cesena', u'Chievo Verona', u'Dijon FCO',
       u'Espanyol', u'Evian Thonon Gaillard', u'FC Augsburg', u'FC Cologne',
       u'Fiorentina', u'Genoa', u'Getafe', u'Granada', u'Hamburg SV',
       u'Hannover 96', u'Hertha Berlin', u'Internazionale', u'Juventus',
       u'Kaiserslautern', u'Lazio', u'Lecce', u'Levante', u'Lille', u'Lorient',
       u'Lyon', u'Mainz', u'Malaga', u'Mallorca', u'Marseille', u'Montpellier',
       u'Napoli', u'Nice', u'Novara', u'Nurnberg', u'Osasuna', u'Palermo',
       u'Paris Saint-Germain', u'Parma', u'Racing Santander',
       u'Rayo Vallecano', u'Real Betis', u'Real Madrid', u'Real Sociedad',
       u'Real Zaragoza', u'SC Freiburg', u'Schalke 04', u'Sevilla', u'Siena',
       u'Sochaux', u'Sporting Gijon', u'St Etienne', u'Stade Rennes',
       u'TSG Hoffenheim', u'Toulouse', u'Udinese', u'Valencia',
       u'Valenciennes', u'VfB Stuttgart', u'VfL Wolfsburg', u'Villarreal',
       u'Werder Bremen']]
y_train = train['cluster']
X_test = test[[u'AC Milan', u'AJ Auxerre', u'AS Nancy Lorraine', u'AS Roma',
       u'Atalanta', u'Athletic Bilbao', u'Atletico Madrid', u'Barcelona',
       u'Bayer Leverkusen', u'Bayern Munich', u'Bologna', u'Bordeaux',
       u'Borussia Dortmund', u'Borussia Monchengladbach', u'Brest', u'Caen',
       u'Cagliari', u'Catania', u'Cesena', u'Chievo Verona', u'Dijon FCO',
       u'Espanyol', u'Evian Thonon Gaillard', u'FC Augsburg', u'FC Cologne',
       u'Fiorentina', u'Genoa', u'Getafe', u'Granada', u'Hamburg SV',
       u'Hannover 96', u'Hertha Berlin', u'Internazionale', u'Juventus',
       u'Kaiserslautern', u'Lazio', u'Lecce', u'Levante', u'Lille', u'Lorient',
       u'Lyon', u'Mainz', u'Malaga', u'Mallorca', u'Marseille', u'Montpellier',
       u'Napoli', u'Nice', u'Novara', u'Nurnberg', u'Osasuna', u'Palermo',
       u'Paris Saint-Germain', u'Parma', u'Racing Santander',
       u'Rayo Vallecano', u'Real Betis', u'Real Madrid', u'Real Sociedad',
       u'Real Zaragoza', u'SC Freiburg', u'Schalke 04', u'Sevilla', u'Siena',
       u'Sochaux', u'Sporting Gijon', u'St Etienne', u'Stade Rennes',
       u'TSG Hoffenheim', u'Toulouse', u'Udinese', u'Valencia',
       u'Valenciennes', u'VfB Stuttgart', u'VfL Wolfsburg', u'Villarreal',
       u'Werder Bremen']]
y_test = test['cluster']



In [9]:

    
X_train = X_train.astype(int)
X_test = X_test.astype(int)



In [24]:

    
## guide for using multi-class prediction XGBoost

xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 4

watchlist = [(xg_train,'train'), (xg_test, 'test')]
num_round = 5
bst = xgb.train(param, xg_train, num_round, watchlist );
# get prediction
pred = bst.predict( xg_test );









    



[0]	train-merror:0.556463	test-merror:0.65528
[1]	train-merror:0.545578	test-merror:0.663043
[2]	train-merror:0.545578	test-merror:0.661491
[3]	train-merror:0.540136	test-merror:0.667702
[4]	train-merror:0.540136	test-merror:0.667702



In [ ]:

    
param['objective'] = 'multi:softprob'
bst = xgb.train(param, xg_train, num_round, watchlist );
yprob = bst.predict( xg_test ).reshape( y_test.shape[0], 4 )
ylabel = np.argmax(yprob, axis=1)



In [27]:

    
yprob









    Out[27]:





array([[ 0.22992329,  0.20177852,  0.32427084,  0.24402735],
       [ 0.22027227,  0.2495105 ,  0.29643291,  0.23378432],
       [ 0.26139805,  0.23211528,  0.2890012 ,  0.21748547],
       ..., 
       [ 0.21509743,  0.24364877,  0.30336127,  0.23789248],
       [ 0.20529623,  0.2325466 ,  0.28953823,  0.27261895],
       [ 0.2171825 ,  0.2460106 ,  0.30630195,  0.23050499]], dtype=float32)



In [29]:

    
print 'Model Accuracy Score:' 
print accuracy_score(y_test, pred)
print 'Model Confusion Matrix:' 
print confusion_matrix(y_test, pred)
print 'Model Classification Report:' 
print classification_report(y_test, pred)









    



Model Accuracy Score:
0.332298136646
Model Confusion Matrix:
[[ 11   9  74   4]
 [ 21   9 113  15]
 [ 15  22 187   9]
 [ 11  14 123   7]]
Model Classification Report:
             precision    recall  f1-score   support

        0.0       0.19      0.11      0.14        98
        1.0       0.17      0.06      0.08       158
        2.0       0.38      0.80      0.51       233
        3.0       0.20      0.05      0.07       155

avg / total       0.25      0.33      0.25       644



In [30]:

    
### Leveraged next few cells from sklearn to calculate and plot roc-auc for multi-class


from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from scipy import interp

# Binarize the output

y = label_binarize(y_test, classes=[0, 1, 2, 3])
n_classes = y.shape[1]


# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y[:, i], yprob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y.ravel(), yprob.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])



In [33]:

    
### from sklearn
# Compute macro-average ROC curve and ROC area

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = ['aqua', 'darkorange', 'cornflowerblue', 'green']
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()

Model is created above.

The results above show that the model could use some work. Some of the features could be tweaked to allow these clusters to be slightly more predictive. However, we will continue with the clusters created to predict the watchability of all future matches in the rest of the 2016-2017 season for each league.

The model that we use takes in the two teams that are playing in a match, and then predicts the watchability based on those two teams. Since this data has unique matches occurring only within leagues, a predictive model will have to be trained and fit for each league for the current season. I have obtained the remaining league fixtures from the point after data was provided, and run each league through the same process to obtain results.

First, England.



In [191]:

    
england_train_17 = dummy_year_pipe(england, 2017)



In [192]:

    
epl_17 = pd.read_csv('epl_17_fixtures.csv', encoding = 'utf-8')



In [193]:

    
epl_17.loc[epl_17.Team1 == 'Leicester', 'Team1'] = 'Leicester City'
epl_17.loc[epl_17.Team2 == 'Leicester', 'Team2'] = 'Leicester City'
epl_17.loc[epl_17.Team1 == 'Manchester United', 'Team1'] = 'Manchester Utd'
epl_17.loc[epl_17.Team2 == 'Manchester United', 'Team2'] = 'Manchester Utd'
epl_17.loc[epl_17.Team1 == 'Stoke', 'Team1'] = 'Stoke City'
epl_17.loc[epl_17.Team2 == 'Stoke', 'Team2'] = 'Stoke City'
epl_17.loc[epl_17.Team1 == 'tottenham', 'Team1'] = 'Tottenham'
epl_17.loc[epl_17.Team2 == 'tottenham', 'Team2'] = 'Tottenham'
epl_17.loc[epl_17.Team1 == 'chelsea', 'Team1'] = 'Chelsea'
epl_17.loc[epl_17.Team2 == 'chelsea', 'Team2'] = 'Chelsea'
epl_17.loc[epl_17.Team1 == 'Arsenal', 'Team1'] = 'Arsenal'
epl_17.loc[epl_17.Team2 == 'Arsenal', 'Team2'] = 'Arsenal'



In [194]:

    
epl_17_clubs = pd.unique(epl_17[['Team1','Team2']].values.ravel())
    
    ## drop first alphabetical club for comparison, return in DF to append to original df
clubs = np.sort(epl_17_clubs)[1:len(epl_17_clubs)]
club_df = pd.DataFrame(columns = clubs)
    
epl_17 = epl_17.append(club_df)
    
for club in clubs:
    for i, row in epl_17.iterrows():
        if row['Team1'] == club or row['Team2'] == club:
            dummy = 1
        else:
            dummy = 0
        epl_17.loc[i, club] = dummy



In [195]:

    
X_train = england_train_17[[u'Bournemouth', u'Burnley', u'Chelsea', u'Crystal Palace', u'Everton',
       u'Hull', u'Leicester City', u'Liverpool', u'Manchester City',
       u'Manchester Utd', u'Middlesbrough', u'Southampton', u'Stoke City',
       u'Sunderland', u'Swansea', u'Tottenham', u'Watford', u'West Brom',
       u'West Ham']]
y_train = england_train_17['cluster']
X_test = epl_17[[u'Bournemouth', u'Burnley', u'Chelsea', u'Crystal Palace', u'Everton',
       u'Hull', u'Leicester City', u'Liverpool', u'Manchester City',
       u'Manchester Utd', u'Middlesbrough', u'Southampton', u'Stoke City',
       u'Sunderland', u'Swansea', u'Tottenham', u'Watford', u'West Brom',
       u'West Ham']]
## y_test = test['cluster']

X_train = X_train.astype(int)
X_test = X_test.astype(int)



In [ ]:



In [196]:

    
xg_train_17 = xgb.DMatrix( X_train, label=y_train)
xg_test_17 = xgb.DMatrix(X_test)
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 4
param['eval_metric'] = 'auc'

num_round = 5
bst_17 = xgb.train(param, xg_train_17, num_round)
# get prediction
pred_17 = bst_17.predict( xg_test_17 );



In [177]:

    
pred_17









    Out[177]:





array([ 3.,  3.,  3.,  1.,  3.,  3.,  0.,  0.,  3.,  3.,  0.,  1.,  3.,
        3.,  3.,  3.,  3.,  0.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,
        3.,  1.,  3.,  1.,  3.,  3.,  1.,  3.,  3.,  3.,  3.,  3.,  3.,
        0.,  3.,  1.,  3.,  0.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  0.,
        1.,  3.,  3.,  1.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  0.,
        3.,  3.,  0.,  3.,  1.,  1.,  1.,  3.,  3.,  3.,  0.,  3.,  0.,
        3.,  3.,  1.,  3.,  0.,  3.,  3.,  3.,  3.,  1.,  3.,  3.,  3.,
        3.,  3.,  3.,  3.,  0.,  1.,  3.,  3.,  3.,  1.,  1.,  3.,  3.,
        3.,  3.,  3.,  0.,  3.,  3.,  3.,  3.,  3.,  0.,  3.,  3.,  3.,
        1.,  3.,  2.,  3.,  1.,  3.,  3.,  3.,  3.,  3.,  2.,  3.,  3.,
        3.,  3.,  0.,  0.,  3.,  3.,  1.,  3.,  3.,  3.,  3.], dtype=float32)



In [197]:

    
epl_17_predict = epl_17.drop([u'Bournemouth', u'Burnley', u'Chelsea', u'Crystal Palace', u'Everton',
       u'Hull', u'Leicester City', u'Liverpool', u'Manchester City',
       u'Manchester Utd', u'Middlesbrough', u'Southampton', u'Stoke City',
       u'Sunderland', u'Swansea', u'Tottenham', u'Watford', u'West Brom',
       u'West Ham'], axis = 1)
epl_17_predict['cluster'] = pred_17



In [198]:

    
epl_17_predict









    Out[198]:






  
    
      
      Team1
      Team2
      date_time
      cluster
    
  
  
    
      0
      Arsenal
      Hull
      11:02. 13:30
      3.0
    
    
      1
      Manchester Utd
      Watford
      11:02. 16:00
      3.0
    
    
      2
      Middlesbrough
      Everton
      11:02. 16:00
      3.0
    
    
      3
      Stoke City
      Crystal Palace
      11:02. 16:00
      1.0
    
    
      4
      Sunderland
      Southampton
      11:02. 16:00
      3.0
    
    
      5
      West Ham
      West Brom
      11:02. 16:00
      3.0
    
    
      6
      Liverpool
      Tottenham
      11:02. 18:30
      0.0
    
    
      7
      Burnley
      Chelsea
      12:02. 14:30
      0.0
    
    
      8
      Swansea
      Leicester City
      12:02. 17:00
      3.0
    
    
      9
      Bournemouth
      Manchester City
      13:02. 21:00
      3.0
    
    
      10
      Chelsea
      Swansea
      25.02. 16:00
      0.0
    
    
      11
      Crystal Palace
      Middlesbrough
      25.02. 16:00
      1.0
    
    
      12
      Everton
      Sunderland
      25.02. 16:00
      3.0
    
    
      13
      Hull
      Burnley
      25.02. 16:00
      3.0
    
    
      14
      Southampton
      Arsenal
      25.02. 16:00
      3.0
    
    
      15
      West Brom
      Bournemouth
      25.02. 16:00
      3.0
    
    
      16
      Watford
      West Ham
      25.02. 18:30
      3.0
    
    
      17
      Tottenham
      Stoke City
      26.02. 14:30
      0.0
    
    
      18
      Manchester City
      Manchester Utd
      26.02. 15:15
      3.0
    
    
      19
      Leicester City
      Liverpool
      27.02. 21:00
      3.0
    
    
      20
      Manchester Utd
      Bournemouth
      04.03. 13:30
      3.0
    
    
      21
      Leicester City
      Hull
      04.03. 16:00
      3.0
    
    
      22
      Stoke City
      Middlesbrough
      04.03. 16:00
      3.0
    
    
      23
      Swansea
      Burnley
      04.03. 16:00
      3.0
    
    
      24
      Watford
      Southampton
      04.03. 16:00
      3.0
    
    
      25
      West Brom
      Crystal Palace
      04.03. 16:00
      3.0
    
    
      26
      Liverpool
      Arsenal
      04.03. 18:30
      3.0
    
    
      27
      Tottenham
      Everton
      05.03. 14:30
      1.0
    
    
      28
      Sunderland
      Manchester City
      05.03. 17:00
      3.0
    
    
      29
      West Ham
      Chelsea
      06.03. 21:00
      1.0
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      110
      Arsenal
      Manchester Utd
      06.05. 16:00
      3.0
    
    
      111
      Bournemouth
      Stoke City
      06.05. 16:00
      3.0
    
    
      112
      Burnley
      West Brom
      06.05. 16:00
      3.0
    
    
      113
      Chelsea
      Middlesbrough
      06.05. 16:00
      0.0
    
    
      114
      Hull
      Sunderland
      06.05. 16:00
      3.0
    
    
      115
      Leicester City
      Watford
      06.05. 16:00
      3.0
    
    
      116
      Liverpool
      Southampton
      06.05. 16:00
      3.0
    
    
      117
      Manchester City
      Crystal Palace
      06.05. 16:00
      1.0
    
    
      118
      Swansea
      Everton
      06.05. 16:00
      3.0
    
    
      119
      West Ham
      Tottenham
      06.05. 16:00
      2.0
    
    
      120
      Bournemouth
      Burnley
      13.05. 16:00
      3.0
    
    
      121
      Crystal Palace
      Hull
      13.05. 16:00
      1.0
    
    
      122
      Everton
      Watford
      13.05. 16:00
      3.0
    
    
      123
      Manchester City
      Leicester City
      13.05. 16:00
      3.0
    
    
      124
      Middlesbrough
      Southampton
      13.05. 16:00
      3.0
    
    
      125
      Stoke City
      Arsenal
      13.05. 16:00
      3.0
    
    
      126
      Sunderland
      Swansea
      13.05. 16:00
      3.0
    
    
      127
      Tottenham
      Manchester Utd
      13.05. 16:00
      2.0
    
    
      128
      West Brom
      Chelsea
      13.05. 16:00
      3.0
    
    
      129
      West Ham
      Liverpool
      13.05. 16:00
      3.0
    
    
      130
      Arsenal
      Everton
      21.05. 16:00
      3.0
    
    
      131
      Burnley
      West Ham
      21.05. 16:00
      3.0
    
    
      132
      Chelsea
      Sunderland
      21.05. 16:00
      0.0
    
    
      133
      Hull
      Tottenham
      21.05. 16:00
      0.0
    
    
      134
      Leicester City
      Bournemouth
      21.05. 16:00
      3.0
    
    
      135
      Liverpool
      Middlesbrough
      21.05. 16:00
      3.0
    
    
      136
      Manchester Utd
      Crystal Palace
      21.05. 16:00
      1.0
    
    
      137
      Southampton
      Stoke City
      21.05. 16:00
      3.0
    
    
      138
      Swansea
      West Brom
      21.05. 16:00
      3.0
    
    
      139
      Watford
      Manchester City
      21.05. 16:00
      3.0
    
  

140 rows × 4 columns

Spain Below



In [112]:

    
spain_train_17 = dummy_year_pipe(spain, 2017)

esp_17 = pd.read_csv('esp_17_fixtures.csv', encoding = 'utf-8')



In [113]:

    
esp_17.loc[esp_17.Team1 == 'Ath. Bilbao', 'Team1'] = 'Athletic Bilbao'
esp_17.loc[esp_17.Team2 == 'Ath. Bilbao', 'Team2'] = 'Athletic Bilbao'
esp_17.loc[esp_17.Team1 == 'Atl. Madrid', 'Team1'] = 'Atletico Madrid'
esp_17.loc[esp_17.Team2 == 'Atl. Madrid', 'Team2'] = 'Atletico Madrid'
esp_17.loc[esp_17.Team1 == 'Betis', 'Team1'] = 'Real Betis'
esp_17.loc[esp_17.Team2 == 'Betis', 'Team2'] = 'Real Betis'
esp_17.loc[esp_17.Team1 == 'Gijon', 'Team1'] = 'Sporting Gijon'
esp_17.loc[esp_17.Team2 == 'Gijon', 'Team2'] = 'Sporting Gijon'
esp_17.loc[esp_17.Team1 == 'La Coruna', 'Team1'] = 'Deportivo La Coruna'
esp_17.loc[esp_17.Team2 == 'La Coruna', 'Team2'] = 'Deportivo La Coruna'



In [114]:

    
esp_17_clubs = pd.unique(esp_17[['Team1','Team2']].values.ravel())
    
    ## drop first alphabetical club for comparison, return in DF to append to original df
clubs = np.sort(esp_17_clubs)[1:len(esp_17_clubs)]
club_df = pd.DataFrame(columns = clubs)
    
esp_17 = esp_17.append(club_df)
    
for club in clubs:
    for i, row in esp_17.iterrows():
        if row['Team1'] == club or row['Team2'] == club:
            dummy = 1
        else:
            dummy = 0
        esp_17.loc[i, club] = dummy
        
print spain_train_17.columns
print esp_17.columns









    



Index([u'Athletic Bilbao', u'Atletico Madrid', u'Barcelona', u'Celta Vigo',
       u'Deportivo La Coruna', u'Eibar', u'Espanyol', u'Granada',
       u'Las Palmas', u'Leganes', u'Malaga', u'Osasuna', u'Real Betis',
       u'Real Madrid', u'Real Sociedad', u'Sevilla', u'Sporting Gijon',
       u'Valencia', u'Villarreal', u'at', u'avg_goal_diff', u'cluster',
       u'country', u'date', u'ftag', u'fthg', u'ht', u'id_event', u'id_odsp',
       u'late_goals', u'lead_change', u'league', u'month', u'season', u'year'],
      dtype='object')
Index([    u'Athletic Bilbao',     u'Atletico Madrid',           u'Barcelona',
                u'Celta Vigo', u'Deportivo La Coruna',               u'Eibar',
                  u'Espanyol',             u'Granada',          u'Las Palmas',
                   u'Leganes',              u'Malaga',             u'Osasuna',
                u'Real Betis',         u'Real Madrid',       u'Real Sociedad',
                   u'Sevilla',      u'Sporting Gijon',               u'Team1',
                     u'Team2',            u'Valencia',          u'Villarreal',
                 u'date_time'],
      dtype='object')



In [115]:

    
X_train = spain_train_17[[u'Athletic Bilbao', u'Atletico Madrid', u'Barcelona', u'Celta Vigo',
       u'Deportivo La Coruna', u'Eibar', u'Espanyol', u'Granada',
       u'Las Palmas', u'Leganes', u'Malaga', u'Osasuna', u'Real Betis',
       u'Real Madrid', u'Real Sociedad', u'Sevilla', u'Sporting Gijon',
       u'Valencia', u'Villarreal']]
y_train = spain_train_17['cluster']
X_test = esp_17[[u'Athletic Bilbao', u'Atletico Madrid', u'Barcelona', u'Celta Vigo',
       u'Deportivo La Coruna', u'Eibar', u'Espanyol', u'Granada',
       u'Las Palmas', u'Leganes', u'Malaga', u'Osasuna', u'Real Betis',
       u'Real Madrid', u'Real Sociedad', u'Sevilla', u'Sporting Gijon',
       u'Valencia', u'Villarreal']]
## y_test = test['cluster']

X_train = X_train.astype(int)
X_test = X_test.astype(int)



In [116]:

    
xg_train_17 = xgb.DMatrix( X_train, label=y_train)
xg_test_17 = xgb.DMatrix(X_test)
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 4
param['eval_metric'] = 'auc'

num_round = 5
bst_17 = xgb.train(param, xg_train_17, num_round)
# get prediction
pred_17 = bst_17.predict( xg_test_17 );



In [117]:

    
esp_17_predict = esp_17.drop([u'Athletic Bilbao', u'Atletico Madrid', u'Barcelona', u'Celta Vigo',
       u'Deportivo La Coruna', u'Eibar', u'Espanyol', u'Granada',
       u'Las Palmas', u'Leganes', u'Malaga', u'Osasuna', u'Real Betis',
       u'Real Madrid', u'Real Sociedad', u'Sevilla', u'Sporting Gijon',
       u'Valencia', u'Villarreal'], axis = 1)
esp_17_predict['cluster'] = pred_17









    Out[117]:






  
    
      
      Team1
      Team2
      date_time
      cluster
    
  
  
    
      0
      Espanyol
      Real Sociedad
      10.02. 20:45
      3.0
    
    
      1
      Real Betis
      Valencia
      11.02. 13:00
      1.0
    
    
      2
      Alaves
      Barcelona
      11.02. 16:15
      0.0
    
    
      3
      Athletic Bilbao
      Deportivo La Coruna
      11.02. 18:30
      1.0
    
    
      4
      Osasuna
      Real Madrid
      11.02. 20:45
      0.0



In [119]:

    
esp_17_predict.head()









    Out[119]:






  
    
      
      Team1
      Team2
      date_time
      cluster
    
  
  
    
      0
      Espanyol
      Real Sociedad
      10.02. 20:45
      3.0
    
    
      1
      Real Betis
      Valencia
      11.02. 13:00
      1.0
    
    
      2
      Alaves
      Barcelona
      11.02. 16:15
      0.0
    
    
      3
      Athletic Bilbao
      Deportivo La Coruna
      11.02. 18:30
      1.0
    
    
      4
      Osasuna
      Real Madrid
      11.02. 20:45
      0.0

Germany Below



In [128]:

    
germ_train_17 = dummy_year_pipe(germany, 2017)

ger_17 = pd.read_csv('ger_17_fixtures.csv', encoding = 'utf-8')

ger_17.loc[ger_17.Team1 == 'Dortmund', 'Team1'] = 'Borussia Dortmund'
ger_17.loc[ger_17.Team2 == 'Dortmund', 'Team2'] = 'Borussia Dortmund'
ger_17.loc[ger_17.Team1 == 'Monchengladbach', 'Team1'] = 'Borussia Monchengladbach'
ger_17.loc[ger_17.Team2 == 'Monchengladbach', 'Team2'] = 'Borussia Monchengladbach'
ger_17.loc[ger_17.Team1 == 'Darmstadt', 'Team1'] = 'SV Darmstadt 98'
ger_17.loc[ger_17.Team2 == 'Darmstadt', 'Team2'] = 'SV Darmstadt 98'
ger_17.loc[ger_17.Team1 == 'Leverkusen', 'Team1'] = 'Bayer Leverkusen'
ger_17.loc[ger_17.Team2 == 'Leverkusen', 'Team2'] = 'Bayer Leverkusen'
ger_17.loc[ger_17.Team1 == 'Augsburg', 'Team1'] = 'FC Augsburg'
ger_17.loc[ger_17.Team2 == 'Augsburg', 'Team2'] = 'FC Augsburg'
ger_17.loc[ger_17.Team1 == 'FC Koln', 'Team1'] = 'FC Cologne'
ger_17.loc[ger_17.Team2 == 'FC Koln', 'Team2'] = 'FC Cologne'
ger_17.loc[ger_17.Team1 == 'Freiburg', 'Team1'] = 'SC Freiburg'
ger_17.loc[ger_17.Team2 == 'Freiburg', 'Team2'] = 'SC Freiburg'
ger_17.loc[ger_17.Team1 == 'Hamburger SV', 'Team1'] = 'Hamburg SV'
ger_17.loc[ger_17.Team2 == 'Hamburger SV', 'Team2'] = 'Hamburg SV'
ger_17.loc[ger_17.Team1 == 'Frankfurt', 'Team1'] = 'Eintracht Frankfurt'
ger_17.loc[ger_17.Team2 == 'Frankfurt', 'Team2'] = 'Eintracht Frankfurt'
ger_17.loc[ger_17.Team1 == 'Hertha', 'Team1'] = 'Hertha Berlin'
ger_17.loc[ger_17.Team2 == 'Hertha', 'Team2'] = 'Hertha Berlin'
ger_17.loc[ger_17.Team1 == 'Hoffenheim', 'Team1'] = 'TSG Hoffenheim'
ger_17.loc[ger_17.Team2 == 'Hoffenheim', 'Team2'] = 'TSG Hoffenheim'
ger_17.loc[ger_17.Team1 == 'Ingolstadt', 'Team1'] = 'FC Ingolstadt 04'
ger_17.loc[ger_17.Team2 == 'Ingolstadt', 'Team2'] = 'FC Ingolstadt 04'
ger_17.loc[ger_17.Team1 == 'Schalke', 'Team1'] = 'Schalke 04'
ger_17.loc[ger_17.Team2 == 'Schalke', 'Team2'] = 'Schalke 04'
ger_17.loc[ger_17.Team1 == 'Wolfsburg', 'Team1'] = 'VfL Wolfsburg'
ger_17.loc[ger_17.Team2 == 'Wolfsburg', 'Team2'] = 'VfL Wolfsburg'



In [130]:

    
print germ_train_17.columns
print ger_17.columns









    



Index([u'Bayern Munich', u'Borussia Dortmund', u'Borussia Monchengladbach',
       u'Eintracht Frankfurt', u'FC Augsburg', u'FC Cologne',
       u'FC Ingolstadt 04', u'Hamburg SV', u'Hertha Berlin', u'Mainz',
       u'RB Leipzig', u'SC Freiburg', u'SV Darmstadt 98', u'Schalke 04',
       u'TSG Hoffenheim', u'VfL Wolfsburg', u'Werder Bremen', u'at',
       u'avg_goal_diff', u'cluster', u'country', u'date', u'ftag', u'fthg',
       u'ht', u'id_event', u'id_odsp', u'late_goals', u'lead_change',
       u'league', u'month', u'season', u'year'],
      dtype='object')
Index([           u'Bayern Munich',        u'Borussia Dortmund',
       u'Borussia Monchengladbach',      u'Eintracht Frankfurt',
                    u'FC Augsburg',               u'FC Cologne',
               u'FC Ingolstadt 04',               u'Hamburg SV',
                  u'Hertha Berlin',                    u'Mainz',
                     u'RB Leipzig',              u'SC Freiburg',
                u'SV Darmstadt 98',               u'Schalke 04',
                 u'TSG Hoffenheim',                    u'Team1',
                          u'Team2',            u'VfL Wolfsburg',
                  u'Werder Bremen',                u'date_time'],
      dtype='object')



In [129]:

    
ger_17_clubs = pd.unique(ger_17[['Team1','Team2']].values.ravel())
    
    ## drop first alphabetical club for comparison, return in DF to append to original df
clubs = np.sort(ger_17_clubs)[1:len(ger_17_clubs)]
club_df = pd.DataFrame(columns = clubs)
    
ger_17 = ger_17.append(club_df)
    
for club in clubs:
    for i, row in ger_17.iterrows():
        if row['Team1'] == club or row['Team2'] == club:
            dummy = 1
        else:
            dummy = 0
        ger_17.loc[i, club] = dummy



In [131]:

    
X_train = germ_train_17[[u'Bayern Munich', u'Borussia Dortmund', u'Borussia Monchengladbach',
       u'Eintracht Frankfurt', u'FC Augsburg', u'FC Cologne',
       u'FC Ingolstadt 04', u'Hamburg SV', u'Hertha Berlin', u'Mainz',
       u'RB Leipzig', u'SC Freiburg', u'SV Darmstadt 98', u'Schalke 04',
       u'TSG Hoffenheim', u'VfL Wolfsburg', u'Werder Bremen']]
y_train = germ_train_17['cluster']
X_test = ger_17[[u'Bayern Munich', u'Borussia Dortmund', u'Borussia Monchengladbach',
       u'Eintracht Frankfurt', u'FC Augsburg', u'FC Cologne',
       u'FC Ingolstadt 04', u'Hamburg SV', u'Hertha Berlin', u'Mainz',
       u'RB Leipzig', u'SC Freiburg', u'SV Darmstadt 98', u'Schalke 04',
       u'TSG Hoffenheim', u'VfL Wolfsburg', u'Werder Bremen']]
## y_test = test['cluster']

X_train = X_train.astype(int)
X_test = X_test.astype(int)



In [ ]:



In [132]:

    
xg_train_17 = xgb.DMatrix( X_train, label=y_train)
xg_test_17 = xgb.DMatrix(X_test)
param = {}
param['objective'] = 'multi:softmax'
param['eta'] = 0.1
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 4
param['eval_metric'] = 'auc'

num_round = 5
bst_17 = xgb.train(param, xg_train_17, num_round)
# get prediction
pred_17 = bst_17.predict( xg_test_17 );



In [133]:

    
ger_17_predict = ger_17.drop([u'Bayern Munich', u'Borussia Dortmund', u'Borussia Monchengladbach',
       u'Eintracht Frankfurt', u'FC Augsburg', u'FC Cologne',
       u'FC Ingolstadt 04', u'Hamburg SV', u'Hertha Berlin', u'Mainz',
       u'RB Leipzig', u'SC Freiburg', u'SV Darmstadt 98', u'Schalke 04',
       u'TSG Hoffenheim', u'VfL Wolfsburg', u'Werder Bremen'], axis = 1)
ger_17_predict['cluster'] = pred_17



In [149]:

    
ger_17_predict.head()









    Out[149]:






  
    
      
      Team1
      Team2
      date_time
      cluster
    
  
  
    
      0
      Mainz
      FC Augsburg
      10:02. 20:30
      3.0
    
    
      1
      SV Darmstadt 98
      Borussia Dortmund
      11:02. 15:30
      2.0
    
    
      2
      FC Ingolstadt 04
      Bayern Munich
      11:02. 15:30
      2.0
    
    
      3
      Bayer Leverkusen
      Eintracht Frankfurt
      11:02. 15:30
      3.0
    
    
      4
      RB Leipzig
      Hamburg SV
      11:02. 15:30
      2.0



In [ ]:

Italy Below



In [145]:

    
ita_train_17 = dummy_year_pipe(italy, 2017)

ita_17 = pd.read_csv('ita_17_fixtures.csv', encoding = 'utf-8')

ita_17.loc[ita_17.Team1 == 'Chievo', 'Team1'] = 'Chievo Verona'
ita_17.loc[ita_17.Team2 == 'Chievo', 'Team2'] = 'Chievo Verona'
ita_17.loc[ita_17.Team1 == 'inter', 'Team1'] = 'Internazionale'
ita_17.loc[ita_17.Team2 == 'inter', 'Team2'] = 'Internazionale'
ita_17.loc[ita_17.Team1 == 'Pescara', 'Team1'] = 'US Pescara'
ita_17.loc[ita_17.Team2 == 'Pescara', 'Team2'] = 'US Pescara'



In [144]:

    
print ita_train_17.columns
print np.unique(ita_17[['Team1','Team2']].values.ravel())









    



Index([u'AS Roma', u'Atalanta', u'Bologna', u'Cagliari', u'Chievo Verona',
       u'Crotone', u'Empoli', u'Fiorentina', u'Genoa', u'Internazionale',
       u'Juventus', u'Lazio', u'Napoli', u'Palermo', u'Sampdoria', u'Sassuolo',
       u'Torino', u'US Pescara', u'Udinese', u'at', u'avg_goal_diff',
       u'cluster', u'country', u'date', u'ftag', u'fthg', u'ht', u'id_event',
       u'id_odsp', u'late_goals', u'lead_change', u'league', u'month',
       u'season', u'year'],
      dtype='object')
[u'AC Milan' u'AS Roma' u'Atalanta' u'Bologna' u'Cagliari' u'Chievo'
 u'Crotone' u'Empoli' u'Fiorentina' u'Genoa' u'Juventus' u'Lazio' u'Napoli'
 u'Palermo' u'Pescara' u'Sampdoria' u'Sassuolo' u'Torino' u'Udinese'
 u'inter']



In [146]:

    
ita_17_clubs = pd.unique(ita_17[['Team1','Team2']].values.ravel())
    
    ## drop first alphabetical club for comparison, return in DF to append to original df
clubs = np.sort(ita_17_clubs)[1:len(ita_17_clubs)]
club_df = pd.DataFrame(columns = clubs)
    
ita_17 = ita_17.append(club_df)
    
for club in clubs:
    for i, row in ita_17.iterrows():
        if row['Team1'] == club or row['Team2'] == club:
            dummy = 1
        else:
            dummy = 0
        ita_17.loc[i, club] = dummy



In [147]:

    
X_train = ita_train_17[[u'AS Roma', u'Atalanta', u'Bologna', u'Cagliari', u'Chievo Verona',
       u'Crotone', u'Empoli', u'Fiorentina', u'Genoa', u'Internazionale',
       u'Juventus', u'Lazio', u'Napoli', u'Palermo', u'Sampdoria', u'Sassuolo',
       u'Torino', u'US Pescara', u'Udinese']]
y_train = ita_train_17['cluster']
X_test = ita_17[[u'AS Roma', u'Atalanta', u'Bologna', u'Cagliari', u'Chievo Verona',
       u'Crotone', u'Empoli', u'Fiorentina', u'Genoa', u'Internazionale',
       u'Juventus', u'Lazio', u'Napoli', u'Palermo', u'Sampdoria', u'Sassuolo',
       u'Torino', u'US Pescara', u'Udinese']]
## y_test = test['cluster']

X_train = X_train.astype(int)
X_test = X_test.astype(int)



In [148]:

    
xg_train_17 = xgb.DMatrix( X_train, label=y_train)
xg_test_17 = xgb.DMatrix(X_test)
param = {}
param['objective'] = 'multi:softmax'
param['eta'] = 0.1
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 4
param['eval_metric'] = 'auc'

num_round = 5
bst_17 = xgb.train(param, xg_train_17, num_round)
# get prediction
pred_17 = bst_17.predict( xg_test_17 );



In [150]:

    
ita_17_predict = ita_17.drop([u'AS Roma', u'Atalanta', u'Bologna', u'Cagliari', u'Chievo Verona',
       u'Crotone', u'Empoli', u'Fiorentina', u'Genoa', u'Internazionale',
       u'Juventus', u'Lazio', u'Napoli', u'Palermo', u'Sampdoria', u'Sassuolo',
       u'Torino', u'US Pescara', u'Udinese'], axis = 1)
ita_17_predict['cluster'] = pred_17

ita_17_predict.head()









    Out[150]:






  
    
      
      Team1
      Team2
      date_time
      cluster
    
  
  
    
      0
      Napoli
      Genoa
      10:02. 20:45
      0.0
    
    
      1
      Fiorentina
      Udinese
      11:02. 20:45
      1.0
    
    
      2
      Crotone
      AS Roma
      12:02. 12:30
      2.0
    
    
      3
      Internazionale
      Empoli
      12:02. 15:00
      2.0
    
    
      4
      Palermo
      Atalanta
      12:02. 15:00
      2.0

France below



In [153]:

    
fra_train_17 = dummy_year_pipe(france, 2017)

fra_17 = pd.read_csv('fra_17_fixtures.csv', encoding = 'utf-8')

fra_17.loc[fra_17.Team1 == 'Nancy', 'Team1'] = 'AS Nancy Lorraine'
fra_17.loc[fra_17.Team2 == 'Nancy', 'Team2'] = 'AS Nancy Lorraine'
fra_17.loc[fra_17.Team1 == 'Dijon', 'Team1'] = 'Dijon FCO'
fra_17.loc[fra_17.Team2 == 'Dijon', 'Team2'] = 'Dijon FCO'
fra_17.loc[fra_17.Team1 == 'PSG', 'Team1'] = 'Paris Saint-Germain'
fra_17.loc[fra_17.Team2 == 'PSG', 'Team2'] = 'Paris Saint-Germain'
fra_17.loc[fra_17.Team1 == 'Rennes', 'Team1'] = 'Stade Rennes'
fra_17.loc[fra_17.Team2 == 'Rennes', 'Team2'] = 'Stade Rennes'
fra_17.loc[fra_17.Team1 == 'St. Etienne', 'Team1'] = 'St Etienne'
fra_17.loc[fra_17.Team2 == 'St. Etienne', 'Team2'] = 'St Etienne'
fra_17.loc[fra_17.Team1 == 'Monaco', 'Team1'] = 'AS Monaco'
fra_17.loc[fra_17.Team2 == 'Monaco', 'Team2'] = 'AS Monaco'

print fra_train_17.columns
print np.unique(fra_17[['Team1','Team2']].values.ravel())









    



Index([u'AS Nancy Lorraine', u'Angers', u'Bastia', u'Bordeaux', u'Caen',
       u'Dijon FCO', u'Guingamp', u'Lille', u'Lorient', u'Lyon', u'Marseille',
       u'Metz', u'Montpellier', u'Nantes', u'Nice', u'Paris Saint-Germain',
       u'St Etienne', u'Stade Rennes', u'Toulouse', u'at', u'avg_goal_diff',
       u'cluster', u'country', u'date', u'ftag', u'fthg', u'ht', u'id_event',
       u'id_odsp', u'late_goals', u'lead_change', u'league', u'month',
       u'season', u'year'],
      dtype='object')
['AS Monaco' 'AS Nancy Lorraine' u'Angers' u'Bastia' u'Bordeaux' u'Caen'
 'Dijon FCO' u'Guingamp' u'Lille' u'Lorient' u'Lyon' u'Marseille' u'Metz'
 u'Montpellier' u'Nantes' u'Nice' 'Paris Saint-Germain' 'St Etienne'
 'Stade Rennes' u'Toulouse']



In [154]:

    
fra_17_clubs = pd.unique(fra_17[['Team1','Team2']].values.ravel())
    
    ## drop first alphabetical club for comparison, return in DF to append to original df
clubs = np.sort(fra_17_clubs)[1:len(fra_17_clubs)]
club_df = pd.DataFrame(columns = clubs)
    
fra_17 = fra_17.append(club_df)
    
for club in clubs:
    for i, row in fra_17.iterrows():
        if row['Team1'] == club or row['Team2'] == club:
            dummy = 1
        else:
            dummy = 0
        fra_17.loc[i, club] = dummy



In [155]:

    
X_train = fra_train_17[[u'AS Nancy Lorraine', u'Angers', u'Bastia', u'Bordeaux', u'Caen',
       u'Dijon FCO', u'Guingamp', u'Lille', u'Lorient', u'Lyon', u'Marseille',
       u'Metz', u'Montpellier', u'Nantes', u'Nice', u'Paris Saint-Germain',
       u'St Etienne', u'Stade Rennes', u'Toulouse']]
y_train = fra_train_17['cluster']
X_test = fra_17[[u'AS Nancy Lorraine', u'Angers', u'Bastia', u'Bordeaux', u'Caen',
       u'Dijon FCO', u'Guingamp', u'Lille', u'Lorient', u'Lyon', u'Marseille',
       u'Metz', u'Montpellier', u'Nantes', u'Nice', u'Paris Saint-Germain',
       u'St Etienne', u'Stade Rennes', u'Toulouse']]
## y_test = test['cluster']

X_train = X_train.astype(int)
X_test = X_test.astype(int)



In [156]:

    
xg_train_17 = xgb.DMatrix( X_train, label=y_train)
xg_test_17 = xgb.DMatrix(X_test)
param = {}
param['objective'] = 'multi:softmax'
param['eta'] = 0.1
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 4
param['eval_metric'] = 'auc'

num_round = 5
bst_17 = xgb.train(param, xg_train_17, num_round)
# get prediction
pred_17 = bst_17.predict( xg_test_17 );



In [157]:

    
fra_17_predict = fra_17.drop([u'AS Nancy Lorraine', u'Angers', u'Bastia', u'Bordeaux', u'Caen',
       u'Dijon FCO', u'Guingamp', u'Lille', u'Lorient', u'Lyon', u'Marseille',
       u'Metz', u'Montpellier', u'Nantes', u'Nice', u'Paris Saint-Germain',
       u'St Etienne', u'Stade Rennes', u'Toulouse'], axis = 1)
fra_17_predict['cluster'] = pred_17

fra_17_predict.head()









    Out[157]:






  
    
      
      Team1
      Team2
      date_time
      cluster
    
  
  
    
      0
      Bordeaux
      Paris Saint-Germain
      10:02. 20:45
      3.0
    
    
      1
      Guingamp
      Lyon
      11:02. 17:00
      2.0
    
    
      2
      Dijon FCO
      Caen
      11:02. 8:00 pm
      1.0
    
    
      3
      Lille
      Angers
      11:02. 8:00 pm
      3.0
    
    
      4
      AS Monaco
      Metz
      11:02. 8:00 pm
      3.0



In [199]:

    
print epl_17_predict.head()
print esp_17_predict.head()
print ger_17_predict.head()
print ita_17_predict.head()
print fra_17_predict.head()









    



            Team1           Team2     date_time  cluster
0         Arsenal            Hull  11:02. 13:30      3.0
1  Manchester Utd         Watford  11:02. 16:00      3.0
2   Middlesbrough         Everton  11:02. 16:00      3.0
3      Stoke City  Crystal Palace  11:02. 16:00      1.0
4      Sunderland     Southampton  11:02. 16:00      3.0
             Team1                Team2     date_time  cluster
0         Espanyol        Real Sociedad  10.02. 20:45      3.0
1       Real Betis             Valencia  11.02. 13:00      1.0
2           Alaves            Barcelona  11.02. 16:15      0.0
3  Athletic Bilbao  Deportivo La Coruna  11.02. 18:30      1.0
4          Osasuna          Real Madrid  11.02. 20:45      0.0
              Team1                Team2     date_time  cluster
0             Mainz          FC Augsburg  10:02. 20:30      3.0
1   SV Darmstadt 98    Borussia Dortmund  11:02. 15:30      2.0
2  FC Ingolstadt 04        Bayern Munich  11:02. 15:30      2.0
3  Bayer Leverkusen  Eintracht Frankfurt  11:02. 15:30      3.0
4        RB Leipzig           Hamburg SV  11:02. 15:30      2.0
            Team1     Team2     date_time  cluster
0          Napoli     Genoa  10:02. 20:45      0.0
1      Fiorentina   Udinese  11:02. 20:45      1.0
2         Crotone   AS Roma  12:02. 12:30      2.0
3  Internazionale    Empoli  12:02. 15:00      2.0
4         Palermo  Atalanta  12:02. 15:00      2.0
       Team1                Team2       date_time  cluster
0   Bordeaux  Paris Saint-Germain    10:02. 20:45      3.0
1   Guingamp                 Lyon    11:02. 17:00      2.0
2  Dijon FCO                 Caen  11:02. 8:00 pm      1.0
3      Lille               Angers  11:02. 8:00 pm      3.0
4  AS Monaco                 Metz  11:02. 8:00 pm      3.0

Dates are in an inconsistent format, and the below should pull the date of each match.



In [217]:

    
def dt_split(i):
    splt = i.split('. ', 1)
    date = splt[0]
    time = splt[1]
    try:
        dt = date.split(':', 1)
        return pd.to_datetime("{}-{}-2017".format(dt[1], dt[0])) # dt[1],"/",dt[0],"/2017"
    except:
        dt = date.split('.', 1)
        return pd.to_datetime("{}-{}-2017".format(dt[1], dt[0]))



In [219]:

    
epl_17_predict['date'] = epl_17_predict.date_time.apply(dt_split)



In [221]:

    
epl_17_predict.to_csv('epl_matches_2017.csv', encoding='utf-8')



In [222]:

    
esp_17_predict['date'] = esp_17_predict.date_time.apply(dt_split)
esp_17_predict.to_csv('esp_matches_2017.csv', encoding='utf-8')



In [223]:

    
ger_17_predict['date'] = ger_17_predict.date_time.apply(dt_split)
ger_17_predict.to_csv('ger_matches_2017.csv', encoding='utf-8')



In [224]:

    
ita_17_predict['date'] = ita_17_predict.date_time.apply(dt_split)
ita_17_predict.to_csv('ita_matches_2017.csv', encoding='utf-8')



In [225]:

    
fra_17_predict['date'] = fra_17_predict.date_time.apply(dt_split)
fra_17_predict.to_csv('fra_matches_2017.csv', encoding='utf-8')



In [274]:

    
epl_17_predict['country'] = 'England'
esp_17_predict['country'] = 'Spain'
ger_17_predict['country'] = 'Germany'
ita_17_predict['country'] = 'Italy'
fra_17_predict['country'] = 'France'

epl_17_predict.append([esp_17_predict, ger_17_predict, ita_17_predict, fra_17_predict]).to_csv('league_compare.csv', encoding = 'utf-8')



In [250]:

    
print epl_17_predict.groupby('cluster').date.count().reset_index()
print esp_17_predict.groupby('cluster').date.count().reset_index()
print ger_17_predict.groupby('cluster').date.count().reset_index()
print ita_17_predict.groupby('cluster').date.count().reset_index()
print fra_17_predict.groupby('cluster').date.count().reset_index()









    



   cluster  date
0      0.0    17
1      1.0    19
2      2.0     2
3      3.0   102
   cluster  date
0      0.0    34
1      1.0    41
2      2.0    26
3      3.0    69
   cluster  date
0      0.0     2
1      1.0    15
2      2.0    61
3      3.0    57
   cluster  date
0      0.0    30
1      1.0    33
2      2.0    63
3      3.0    24
   cluster  date
0      1.0    13
1      2.0    14
2      3.0   114



In [262]:

    
league_clust = pd.merge(epl_17_predict.groupby('cluster').date.count().reset_index(), 
         esp_17_predict.groupby('cluster').date.count().reset_index(), on = 'cluster', how = 'left')
league_clust = pd.merge(league_clust, 
         ger_17_predict.groupby('cluster').date.count().reset_index(), on = 'cluster', how = 'left')
league_clust = pd.merge(league_clust, 
         ita_17_predict.groupby('cluster').date.count().reset_index(), on = 'cluster', how = 'left')
league_clust = pd.merge(league_clust, 
         fra_17_predict.groupby('cluster').date.count().reset_index(), on = 'cluster', how = 'left')



In [265]:

    
league_clust.columns = [['cluster','England','Spain','Germany','Italy','France']]
league_clust.fillna(0, inplace = True)
league_clust

Predictions of excitement of matches are plotted below.



In [268]:

    
league_clust.groupby('cluster').max().plot.bar()
plt.show()



In [243]:

    
#epl_17_predict[['Team1','Team2']].stack()
pd.melt(epl_17_predict, id_vars = ['cluster','date'], value_vars = ['Team1','Team2'], value_name = 'team')









    Out[243]:






  
    
      
      cluster
      date
      variable
      team
    
  
  
    
      0
      3.0
      2017-02-11
      Team1
      Arsenal
    
    
      1
      3.0
      2017-02-11
      Team1
      Manchester Utd
    
    
      2
      3.0
      2017-02-11
      Team1
      Middlesbrough
    
    
      3
      1.0
      2017-02-11
      Team1
      Stoke City
    
    
      4
      3.0
      2017-02-11
      Team1
      Sunderland
    
    
      5
      3.0
      2017-02-11
      Team1
      West Ham
    
    
      6
      0.0
      2017-02-11
      Team1
      Liverpool
    
    
      7
      0.0
      2017-02-12
      Team1
      Burnley
    
    
      8
      3.0
      2017-02-12
      Team1
      Swansea
    
    
      9
      3.0
      2017-02-13
      Team1
      Bournemouth
    
    
      10
      0.0
      2017-02-25
      Team1
      Chelsea
    
    
      11
      1.0
      2017-02-25
      Team1
      Crystal Palace
    
    
      12
      3.0
      2017-02-25
      Team1
      Everton
    
    
      13
      3.0
      2017-02-25
      Team1
      Hull
    
    
      14
      3.0
      2017-02-25
      Team1
      Southampton
    
    
      15
      3.0
      2017-02-25
      Team1
      West Brom
    
    
      16
      3.0
      2017-02-25
      Team1
      Watford
    
    
      17
      0.0
      2017-02-26
      Team1
      Tottenham
    
    
      18
      3.0
      2017-02-26
      Team1
      Manchester City
    
    
      19
      3.0
      2017-02-27
      Team1
      Leicester City
    
    
      20
      3.0
      2017-03-04
      Team1
      Manchester Utd
    
    
      21
      3.0
      2017-03-04
      Team1
      Leicester City
    
    
      22
      3.0
      2017-03-04
      Team1
      Stoke City
    
    
      23
      3.0
      2017-03-04
      Team1
      Swansea
    
    
      24
      3.0
      2017-03-04
      Team1
      Watford
    
    
      25
      3.0
      2017-03-04
      Team1
      West Brom
    
    
      26
      3.0
      2017-03-04
      Team1
      Liverpool
    
    
      27
      1.0
      2017-03-05
      Team1
      Tottenham
    
    
      28
      3.0
      2017-03-05
      Team1
      Sunderland
    
    
      29
      1.0
      2017-03-06
      Team1
      West Ham
    
    
      ...
      ...
      ...
      ...
      ...
    
    
      250
      3.0
      2017-05-06
      Team2
      Manchester Utd
    
    
      251
      3.0
      2017-05-06
      Team2
      Stoke City
    
    
      252
      3.0
      2017-05-06
      Team2
      West Brom
    
    
      253
      0.0
      2017-05-06
      Team2
      Middlesbrough
    
    
      254
      3.0
      2017-05-06
      Team2
      Sunderland
    
    
      255
      3.0
      2017-05-06
      Team2
      Watford
    
    
      256
      3.0
      2017-05-06
      Team2
      Southampton
    
    
      257
      1.0
      2017-05-06
      Team2
      Crystal Palace
    
    
      258
      3.0
      2017-05-06
      Team2
      Everton
    
    
      259
      2.0
      2017-05-06
      Team2
      Tottenham
    
    
      260
      3.0
      2017-05-13
      Team2
      Burnley
    
    
      261
      1.0
      2017-05-13
      Team2
      Hull
    
    
      262
      3.0
      2017-05-13
      Team2
      Watford
    
    
      263
      3.0
      2017-05-13
      Team2
      Leicester City
    
    
      264
      3.0
      2017-05-13
      Team2
      Southampton
    
    
      265
      3.0
      2017-05-13
      Team2
      Arsenal
    
    
      266
      3.0
      2017-05-13
      Team2
      Swansea
    
    
      267
      2.0
      2017-05-13
      Team2
      Manchester Utd
    
    
      268
      3.0
      2017-05-13
      Team2
      Chelsea
    
    
      269
      3.0
      2017-05-13
      Team2
      Liverpool
    
    
      270
      3.0
      2017-05-21
      Team2
      Everton
    
    
      271
      3.0
      2017-05-21
      Team2
      West Ham
    
    
      272
      0.0
      2017-05-21
      Team2
      Sunderland
    
    
      273
      0.0
      2017-05-21
      Team2
      Tottenham
    
    
      274
      3.0
      2017-05-21
      Team2
      Bournemouth
    
    
      275
      3.0
      2017-05-21
      Team2
      Middlesbrough
    
    
      276
      1.0
      2017-05-21
      Team2
      Crystal Palace
    
    
      277
      3.0
      2017-05-21
      Team2
      Stoke City
    
    
      278
      3.0
      2017-05-21
      Team2
      West Brom
    
    
      279
      3.0
      2017-05-21
      Team2
      Manchester City
    
  

280 rows × 4 columns



In [ ]:

	id_odsp	date	league	season	country	ht	at	fthg	ftag	lead_change	late_goals	id_event	avg_goal_diff	cluster	month	year
0	UFot0hit/	2011-08-05	D1	2012	germany	Borussia Dortmund	Hamburg SV	3	1	1.0	1.0	110	1.835165	0	8	2011
1	Aw5DflLH/	2011-08-06	D1	2012	germany	FC Augsburg	SC Freiburg	2	2	4.0	1.0	122	0.340659	1	8	2011
2	bkjpaC6n/	2011-08-06	D1	2012	germany	Werder Bremen	Kaiserslautern	2	0	1.0	1.0	137	0.450549	2	8	2011
3	CzPV312a/	2011-08-06	F1	2012	france	Paris Saint-Germain	Lorient	0	1	1.0	0.0	114	0.692308	2	8	2011
4	GUOdmtII/	2011-08-06	F1	2012	france	Caen	Valenciennes	1	0	1.0	0.0	90	0.626374	3	8	2011

	id_odsp	date	league	season	country	ht	at	fthg	ftag	lead_change	late_goals	id_event	avg_goal_diff	cluster	month	year
3062	foAjBseC/	2013-10-05	E0	2014	england	Hull	Aston Villa	0	0	0.0	0.0	80	0.000000	3	10	2013
3063	GfBnCNu6/	2013-10-05	E0	2014	england	Fulham	Stoke City	1	0	1.0	1.0	104	0.136842	3	10	2013
3067	j7h8TOAt/	2013-10-05	E0	2014	england	Sunderland	Manchester Utd	1	2	3.0	0.0	94	0.824176	1	10	2013
3068	l89fA1AI/	2013-10-05	E0	2014	england	Liverpool	Crystal Palace	3	1	1.0	0.0	81	2.105263	0	10	2013
3069	ngSc9LPO/	2013-10-05	E0	2014	england	Manchester City	Everton	3	1	3.0	0.0	97	0.831579	1	10	2013

	id_odsp	date	league	season	country	ht	at	fthg	ftag	lead_change	late_goals	id_event	avg_goal_diff	cluster	month	year
59	CtEdUSdJ/	2011-08-27	SP1	2012	spain	Sporting Gijon	Real Sociedad	1	2	1.0	0.0	99	0.648352	3	8	2011
63	jRPmWlS6/	2011-08-27	SP1	2012	spain	Valencia	Racing Santander	4	3	5.0	2.0	103	0.890110	1	8	2011
73	C88uY13B/	2011-08-28	SP1	2012	spain	Real Zaragoza	Real Madrid	0	6	1.0	2.0	137	2.087912	0	8	2011
76	tvgGyp4n/	2011-08-28	SP1	2012	spain	Sevilla	Malaga	2	1	1.0	1.0	85	1.582418	0	8	2011
77	UaoTYsl5/	2011-08-28	SP1	2012	spain	Athletic Bilbao	Rayo Vallecano	1	1	2.0	0.0	117	0.065934	2	8	2011

	id_odsp	date	league	season	country	ht	at	fthg	ftag	lead_change	late_goals	id_event	avg_goal_diff	cluster	month	year
84	W0scuwXc/	2011-09-09	I1	2012	italy	AC Milan	Lazio	2	2	2.0	0.0	111	0.318681	2	9	2011
102	zesOOHnH/	2011-09-10	I1	2012	italy	Cesena	Napoli	1	3	3.0	1.0	104	0.538462	1	9	2011
105	Ait1vcn4/	2011-09-11	I1	2012	italy	Palermo	Internazionale	4	3	5.0	3.0	118	0.274725	1	9	2011
107	EotSNy2N/	2011-09-11	I1	2012	italy	Chievo Verona	Novara	2	2	2.0	1.0	122	0.912088	2	9	2011
110	jiZlsa2o/	2011-09-11	I1	2012	italy	Juventus	Parma	4	1	1.0	2.0	86	1.461538	0	9	2011

	Team1	Team2	date_time	cluster
0	Arsenal	Hull	11:02. 13:30	3.0
1	Manchester Utd	Watford	11:02. 16:00	3.0
2	Middlesbrough	Everton	11:02. 16:00	3.0
3	Stoke City	Crystal Palace	11:02. 16:00	1.0
4	Sunderland	Southampton	11:02. 16:00	3.0
5	West Ham	West Brom	11:02. 16:00	3.0
6	Liverpool	Tottenham	11:02. 18:30	0.0
7	Burnley	Chelsea	12:02. 14:30	0.0
8	Swansea	Leicester City	12:02. 17:00	3.0
9	Bournemouth	Manchester City	13:02. 21:00	3.0
10	Chelsea	Swansea	25.02. 16:00	0.0
11	Crystal Palace	Middlesbrough	25.02. 16:00	1.0
12	Everton	Sunderland	25.02. 16:00	3.0
13	Hull	Burnley	25.02. 16:00	3.0
14	Southampton	Arsenal	25.02. 16:00	3.0
15	West Brom	Bournemouth	25.02. 16:00	3.0
16	Watford	West Ham	25.02. 18:30	3.0
17	Tottenham	Stoke City	26.02. 14:30	0.0
18	Manchester City	Manchester Utd	26.02. 15:15	3.0
19	Leicester City	Liverpool	27.02. 21:00	3.0
20	Manchester Utd	Bournemouth	04.03. 13:30	3.0
21	Leicester City	Hull	04.03. 16:00	3.0
22	Stoke City	Middlesbrough	04.03. 16:00	3.0
23	Swansea	Burnley	04.03. 16:00	3.0
24	Watford	Southampton	04.03. 16:00	3.0
25	West Brom	Crystal Palace	04.03. 16:00	3.0
26	Liverpool	Arsenal	04.03. 18:30	3.0
27	Tottenham	Everton	05.03. 14:30	1.0
28	Sunderland	Manchester City	05.03. 17:00	3.0
29	West Ham	Chelsea	06.03. 21:00	1.0
...	...	...	...	...
110	Arsenal	Manchester Utd	06.05. 16:00	3.0
111	Bournemouth	Stoke City	06.05. 16:00	3.0
112	Burnley	West Brom	06.05. 16:00	3.0
113	Chelsea	Middlesbrough	06.05. 16:00	0.0
114	Hull	Sunderland	06.05. 16:00	3.0
115	Leicester City	Watford	06.05. 16:00	3.0
116	Liverpool	Southampton	06.05. 16:00	3.0
117	Manchester City	Crystal Palace	06.05. 16:00	1.0
118	Swansea	Everton	06.05. 16:00	3.0
119	West Ham	Tottenham	06.05. 16:00	2.0
120	Bournemouth	Burnley	13.05. 16:00	3.0
121	Crystal Palace	Hull	13.05. 16:00	1.0
122	Everton	Watford	13.05. 16:00	3.0
123	Manchester City	Leicester City	13.05. 16:00	3.0
124	Middlesbrough	Southampton	13.05. 16:00	3.0
125	Stoke City	Arsenal	13.05. 16:00	3.0
126	Sunderland	Swansea	13.05. 16:00	3.0
127	Tottenham	Manchester Utd	13.05. 16:00	2.0
128	West Brom	Chelsea	13.05. 16:00	3.0
129	West Ham	Liverpool	13.05. 16:00	3.0
130	Arsenal	Everton	21.05. 16:00	3.0
131	Burnley	West Ham	21.05. 16:00	3.0
132	Chelsea	Sunderland	21.05. 16:00	0.0
133	Hull	Tottenham	21.05. 16:00	0.0
134	Leicester City	Bournemouth	21.05. 16:00	3.0
135	Liverpool	Middlesbrough	21.05. 16:00	3.0
136	Manchester Utd	Crystal Palace	21.05. 16:00	1.0
137	Southampton	Stoke City	21.05. 16:00	3.0
138	Swansea	West Brom	21.05. 16:00	3.0
139	Watford	Manchester City	21.05. 16:00	3.0

	Team1	Team2	date_time	cluster
0	Espanyol	Real Sociedad	10.02. 20:45	3.0
1	Real Betis	Valencia	11.02. 13:00	1.0
2	Alaves	Barcelona	11.02. 16:15	0.0
3	Athletic Bilbao	Deportivo La Coruna	11.02. 18:30	1.0
4	Osasuna	Real Madrid	11.02. 20:45	0.0

	Team1	Team2	date_time	cluster
0	Mainz	FC Augsburg	10:02. 20:30	3.0
1	SV Darmstadt 98	Borussia Dortmund	11:02. 15:30	2.0
2	FC Ingolstadt 04	Bayern Munich	11:02. 15:30	2.0
3	Bayer Leverkusen	Eintracht Frankfurt	11:02. 15:30	3.0
4	RB Leipzig	Hamburg SV	11:02. 15:30	2.0

	Team1	Team2	date_time	cluster
0	Napoli	Genoa	10:02. 20:45	0.0
1	Fiorentina	Udinese	11:02. 20:45	1.0
2	Crotone	AS Roma	12:02. 12:30	2.0
3	Internazionale	Empoli	12:02. 15:00	2.0
4	Palermo	Atalanta	12:02. 15:00	2.0

	Team1	Team2	date_time	cluster
0	Bordeaux	Paris Saint-Germain	10:02. 20:45	3.0
1	Guingamp	Lyon	11:02. 17:00	2.0
2	Dijon FCO	Caen	11:02. 8:00 pm	1.0
3	Lille	Angers	11:02. 8:00 pm	3.0
4	AS Monaco	Metz	11:02. 8:00 pm	3.0

	cluster	England	Spain	Germany	Italy	France
0	0.0	17	34	2	30	0.0
1	1.0	19	41	15	33	13.0
2	2.0	2	26	61	63	14.0
3	3.0	102	69	57	24	114.0

	cluster	date	variable	team
0	3.0	2017-02-11	Team1	Arsenal
1	3.0	2017-02-11	Team1	Manchester Utd
2	3.0	2017-02-11	Team1	Middlesbrough
3	1.0	2017-02-11	Team1	Stoke City
4	3.0	2017-02-11	Team1	Sunderland
5	3.0	2017-02-11	Team1	West Ham
6	0.0	2017-02-11	Team1	Liverpool
7	0.0	2017-02-12	Team1	Burnley
8	3.0	2017-02-12	Team1	Swansea
9	3.0	2017-02-13	Team1	Bournemouth
10	0.0	2017-02-25	Team1	Chelsea
11	1.0	2017-02-25	Team1	Crystal Palace
12	3.0	2017-02-25	Team1	Everton
13	3.0	2017-02-25	Team1	Hull
14	3.0	2017-02-25	Team1	Southampton
15	3.0	2017-02-25	Team1	West Brom
16	3.0	2017-02-25	Team1	Watford
17	0.0	2017-02-26	Team1	Tottenham
18	3.0	2017-02-26	Team1	Manchester City
19	3.0	2017-02-27	Team1	Leicester City
20	3.0	2017-03-04	Team1	Manchester Utd
21	3.0	2017-03-04	Team1	Leicester City
22	3.0	2017-03-04	Team1	Stoke City
23	3.0	2017-03-04	Team1	Swansea
24	3.0	2017-03-04	Team1	Watford
25	3.0	2017-03-04	Team1	West Brom
26	3.0	2017-03-04	Team1	Liverpool
27	1.0	2017-03-05	Team1	Tottenham
28	3.0	2017-03-05	Team1	Sunderland
29	1.0	2017-03-06	Team1	West Ham
...	...	...	...	...
250	3.0	2017-05-06	Team2	Manchester Utd
251	3.0	2017-05-06	Team2	Stoke City
252	3.0	2017-05-06	Team2	West Brom
253	0.0	2017-05-06	Team2	Middlesbrough
254	3.0	2017-05-06	Team2	Sunderland
255	3.0	2017-05-06	Team2	Watford
256	3.0	2017-05-06	Team2	Southampton
257	1.0	2017-05-06	Team2	Crystal Palace
258	3.0	2017-05-06	Team2	Everton
259	2.0	2017-05-06	Team2	Tottenham
260	3.0	2017-05-13	Team2	Burnley
261	1.0	2017-05-13	Team2	Hull
262	3.0	2017-05-13	Team2	Watford
263	3.0	2017-05-13	Team2	Leicester City
264	3.0	2017-05-13	Team2	Southampton
265	3.0	2017-05-13	Team2	Arsenal
266	3.0	2017-05-13	Team2	Swansea
267	2.0	2017-05-13	Team2	Manchester Utd
268	3.0	2017-05-13	Team2	Chelsea
269	3.0	2017-05-13	Team2	Liverpool
270	3.0	2017-05-21	Team2	Everton
271	3.0	2017-05-21	Team2	West Ham
272	0.0	2017-05-21	Team2	Sunderland
273	0.0	2017-05-21	Team2	Tottenham
274	3.0	2017-05-21	Team2	Bournemouth
275	3.0	2017-05-21	Team2	Middlesbrough
276	1.0	2017-05-21	Team2	Crystal Palace
277	3.0	2017-05-21	Team2	Stoke City
278	3.0	2017-05-21	Team2	West Brom
279	3.0	2017-05-21	Team2	Manchester City