notebook.community

Edit and run



In [1]:

    
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sqlite3
import numpy as np
from numpy import random
database = 'database.sqlite'
conn = sqlite3.connect(database)



In [2]:

    
uk = pd.read_csv('./leagues/1729.csv',index_col='Unnamed: 0')
de = pd.read_csv('./leagues/7809.csv',index_col='Unnamed: 0')
fr = pd.read_csv('./leagues/4769.csv',index_col='Unnamed: 0')
it = pd.read_csv('./leagues/10257.csv',index_col='Unnamed: 0')
es = pd.read_csv('./leagues/21518.csv',index_col='Unnamed: 0')
pt = pd.read_csv('./leagues/17642.csv',index_col='Unnamed: 0')

data= pd.concat([uk,de])
data1= pd.concat([es,fr])
data= pd.concat([data,it])
data1= pd.concat([data1,pt])
data= pd.concat([data,data1])



In [3]:

    
ft = ['home_V','home_D','home_E','home_GF','home_AVG_GF','home_GS','home_AVG_GS','home_VG','home_DG','home_EG','home_GFG','home_AVG_GFG','home_GSG','home_AVG_GSG','away_V','away_D','away_E','away_GF','away_AVG_GF','away_GS','away_AVG_GS','away_VG','away_DG','away_EG','away_GFG','away_AVG_GFG','away_GSG','away_AVG_GSG','h_buildUpPlaySpeed','h_buildUpPlaySpeedClass','h_buildUpPlayDribblingClass','h_buildUpPlayPassing','h_buildUpPlayPassingClass','h_buildUpPlayPositioningClass','h_chanceCreationPassing','h_chanceCreationPassingClass','h_chanceCreationCrossing','h_chanceCreationCrossingClass','h_chanceCreationShooting','h_chanceCreationShootingClass','h_chanceCreationPositioningClass','h_defencePressure','h_defencePressureClass','h_defenceAggression','h_defenceAggressionClass','h_defenceTeamWidth','h_defenceTeamWidthClass','h_defenceDefenderLineClass','a_buildUpPlaySpeed','a_buildUpPlaySpeedClass','a_buildUpPlayDribblingClass','a_buildUpPlayPassing','a_buildUpPlayPassingClass','a_buildUpPlayPositioningClass','a_chanceCreationPassing','a_chanceCreationPassingClass','a_chanceCreationCrossing','a_chanceCreationCrossingClass','a_chanceCreationShooting','a_chanceCreationShootingClass','a_chanceCreationPositioningClass','a_defencePressure','a_defencePressureClass','a_defenceAggression','a_defenceAggressionClass','a_defenceTeamWidth','a_defenceTeamWidthClass','a_defenceDefenderLineClass','h_avg_height','h_avg_weight','a_avg_height','a_avg_weight','h_overall','h_potential','h_def','h_mid','h_att','a_overall','a_potential','a_def','a_mid','a_att','a_date','h_date','id','season','stage','home_team_api_id','away_team_api_id','B365H','B365D','B365A','formation_h','formation_a','league_id'] 

#for column in ft:
#    data[column]=(data[column] - data[column].mean()) / data[column].std()



In [4]:

    
data









    Out[4]:






  
    
      
      home_V
      home_D
      home_E
      home_GF
      home_AVG_GF
      home_GS
      home_AVG_GS
      home_VG
      home_DG
      home_EG
      ...
      season
      stage
      home_team_api_id
      away_team_api_id
      B365H
      B365D
      B365A
      formation_h
      formation_a
      Output
    
  
  
    
      0
      1
      1
      0
      4
      2.000000
      4
      2.000000
      5
      2
      2
      ...
      0
      10
      9825
      8654
      1.17
      7.00
      17.00
      8
      5
      H
    
    
      0
      0
      0
      2
      1
      0.500000
      1
      0.500000
      3
      4
      2
      ...
      0
      10
      10252
      8658
      1.80
      3.60
      4.50
      13
      13
      D
    
    
      0
      1
      1
      0
      2
      1.000000
      1
      0.500000
      2
      3
      4
      ...
      0
      10
      8668
      10194
      1.57
      3.75
      6.50
      13
      12
      H
    
    
      0
      0
      1
      1
      3
      1.500000
      4
      2.000000
      3
      4
      2
      ...
      0
      10
      10261
      8472
      2.20
      3.20
      3.50
      12
      8
      H
    
    
      0
      0
      1
      1
      2
      1.000000
      3
      1.500000
      1
      5
      3
      ...
      0
      10
      8602
      8456
      5.50
      3.40
      1.73
      13
      13
      H
    
    
      0
      0
      0
      2
      1
      0.500000
      1
      0.500000
      2
      4
      3
      ...
      0
      10
      8655
      8455
      7.50
      4.33
      1.44
      13
      11
      A
    
    
      0
      0
      1
      1
      1
      0.500000
      2
      1.000000
      1
      2
      6
      ...
      0
      10
      9879
      8528
      1.91
      3.40
      4.20
      13
      12
      H
    
    
      0
      1
      0
      1
      4
      2.000000
      2
      1.000000
      5
      0
      5
      ...
      0
      11
      10260
      8602
      1.22
      6.00
      15.00
      13
      13
      H
    
    
      0
      2
      1
      0
      5
      1.666667
      4
      1.333333
      6
      2
      2
      ...
      0
      11
      9825
      10261
      1.29
      5.50
      11.00
      11
      12
      A
    
    
      0
      1
      1
      1
      5
      1.666667
      5
      1.666667
      3
      4
      3
      ...
      0
      11
      8650
      8455
      3.80
      3.40
      2.00
      12
      11
      H
    
    
      0
      1
      0
      1
      1
      0.500000
      0
      0.000000
      2
      2
      6
      ...
      0
      11
      8472
      10194
      2.10
      3.20
      3.75
      12
      12
      H
    
    
      0
      0
      1
      1
      1
      0.500000
      2
      1.000000
      2
      5
      3
      ...
      0
      11
      8655
      8528
      1.80
      3.60
      4.50
      5
      13
      H
    
    
      0
      1
      1
      1
      3
      1.000000
      2
      0.666667
      2
      2
      6
      ...
      0
      11
      9879
      10252
      2.20
      3.25
      3.40
      13
      13
      D
    
    
      0
      1
      1
      1
      2
      0.666667
      2
      0.666667
      2
      3
      5
      ...
      0
      11
      8658
      8654
      1.91
      3.40
      4.20
      12
      11
      D
    
    
      0
      1
      2
      0
      5
      1.666667
      6
      2.000000
      4
      5
      1
      ...
      0
      11
      8483
      8668
      4.50
      3.60
      1.80
      11
      13
      D
    
    
      0
      1
      0
      1
      3
      1.500000
      2
      1.000000
      4
      4
      3
      ...
      0
      12
      8586
      8472
      1.53
      4.00
      6.50
      13
      13
      D
    
    
      0
      0
      1
      1
      2
      1.000000
      3
      1.500000
      1
      6
      4
      ...
      0
      12
      8654
      8659
      2.30
      3.25
      3.20
      12
      13
      D
    
    
      0
      1
      0
      1
      3
      1.500000
      2
      1.000000
      2
      6
      3
      ...
      0
      12
      8602
      9825
      5.50
      3.60
      1.67
      13
      8
      A
    
    
      0
      1
      1
      0
      2
      1.000000
      2
      1.000000
      3
      7
      1
      ...
      0
      12
      10194
      8658
      2.10
      3.20
      3.75
      12
      12
      H
    
    
      0
      1
      0
      1
      3
      1.500000
      1
      0.500000
      2
      5
      4
      ...
      0
      12
      8528
      8650
      4.75
      3.60
      1.75
      13
      12
      D
    
    
      0
      1
      0
      1
      7
      3.500000
      3
      1.500000
      5
      4
      2
      ...
      0
      12
      10261
      8655
      1.91
      3.40
      4.20
      12
      1
      A
    
    
      0
      2
      0
      0
      4
      2.000000
      0
      0.000000
      8
      2
      1
      ...
      0
      12
      8455
      9879
      1.20
      6.50
      15.00
      11
      13
      H
    
    
      0
      0
      0
      2
      2
      1.000000
      2
      1.000000
      4
      4
      4
      ...
      0
      13
      8586
      8655
      1.57
      3.80
      6.50
      12
      15
      H
    
    
      0
      0
      1
      1
      3
      1.500000
      4
      2.000000
      1
      6
      5
      ...
      0
      13
      8654
      8483
      1.91
      3.60
      4.00
      12
      8
      D
    
    
      0
      1
      0
      2
      3
      1.000000
      2
      0.666667
      4
      4
      4
      ...
      0
      13
      10252
      10260
      4.50
      3.60
      1.80
      13
      12
      D
    
    
      0
      2
      0
      1
      4
      1.333333
      1
      0.333333
      3
      3
      6
      ...
      0
      13
      8668
      9825
      3.40
      3.30
      2.20
      13
      8
      A
    
    
      0
      1
      1
      1
      8
      2.666667
      5
      1.666667
      5
      5
      2
      ...
      0
      13
      10261
      9879
      1.91
      3.40
      4.20
      12
      12
      D
    
    
      0
      0
      1
      1
      0
      0.000000
      3
      1.500000
      6
      3
      3
      ...
      0
      13
      8456
      8658
      1.40
      4.50
      8.50
      8
      12
      D
    
    
      0
      1
      1
      0
      4
      2.000000
      4
      2.000000
      4
      7
      1
      ...
      0
      13
      10194
      8650
      3.75
      3.25
      2.10
      12
      8
      H
    
    
      0
      0
      0
      2
      2
      1.000000
      2
      1.000000
      2
      5
      5
      ...
      0
      13
      8528
      8659
      2.50
      3.25
      2.88
      13
      13
      H
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      0
      0
      2
      0
      0
      0.000000
      2
      1.000000
      6
      16
      11
      ...
      5
      34
      10238
      6403
      2.30
      3.50
      2.90
      8
      5
      D
    
    
      0
      0
      1
      1
      1
      0.500000
      3
      1.500000
      9
      13
      11
      ...
      5
      34
      9807
      7842
      3.10
      3.50
      2.20
      6
      5
      H
    
    
      0
      2
      0
      0
      3
      1.500000
      1
      0.500000
      28
      4
      1
      ...
      5
      34
      9772
      10214
      1.10
      9.00
      21.00
      3
      6
      H
    
    
      0
      2
      0
      1
      7
      2.333333
      3
      1.000000
      16
      7
      10
      ...
      5
      34
      10264
      9768
      7.00
      4.33
      1.44
      8
      7
      A
    
    
      0
      1
      1
      0
      2
      1.000000
      3
      1.500000
      2
      1
      2
      ...
      5
      6
      158085
      9807
      2.40
      3.00
      3.20
      6
      6
      D
    
    
      0
      1
      1
      0
      1
      0.500000
      1
      0.500000
      2
      2
      1
      ...
      5
      6
      8613
      9768
      9.50
      5.00
      1.33
      8
      7
      D
    
    
      0
      0
      1
      1
      0
      0.000000
      2
      1.000000
      0
      4
      1
      ...
      5
      6
      8348
      9773
      15.00
      5.50
      1.22
      7
      6
      D
    
    
      0
      2
      0
      0
      3
      1.500000
      0
      0.000000
      2
      3
      0
      ...
      5
      6
      10214
      10238
      1.91
      3.40
      4.00
      7
      6
      D
    
    
      0
      1
      1
      0
      2
      1.000000
      2
      1.000000
      2
      1
      2
      ...
      5
      6
      7841
      10215
      1.62
      3.60
      6.00
      2
      7
      H
    
    
      0
      1
      0
      1
      2
      1.000000
      1
      0.500000
      1
      1
      3
      ...
      5
      6
      7844
      10264
      3.50
      3.00
      2.25
      6
      7
      A
    
    
      0
      0
      3
      0
      1
      0.333333
      9
      3.000000
      0
      6
      0
      ...
      5
      7
      10215
      10212
      4.00
      3.10
      2.05
      7
      6
      H
    
    
      0
      0
      1
      1
      1
      0.500000
      4
      2.000000
      2
      2
      2
      ...
      5
      7
      6403
      10214
      2.20
      3.50
      3.10
      7
      6
      H
    
    
      0
      2
      0
      0
      9
      4.500000
      1
      0.500000
      4
      2
      0
      ...
      5
      7
      10264
      158085
      1.53
      4.00
      6.00
      8
      6
      D
    
    
      0
      2
      0
      0
      3
      1.500000
      0
      0.000000
      4
      0
      2
      ...
      5
      7
      9773
      9807
      1.14
      7.00
      21.00
      7
      6
      H
    
    
      0
      2
      1
      0
      3
      1.000000
      2
      0.666667
      3
      1
      2
      ...
      5
      7
      7841
      8613
      2.05
      3.30
      3.60
      7
      6
      H
    
    
      0
      0
      0
      2
      4
      2.000000
      4
      2.000000
      1
      1
      4
      ...
      5
      7
      10238
      7842
      2.50
      3.20
      2.88
      8
      6
      H
    
    
      0
      1
      0
      1
      2
      1.000000
      1
      0.500000
      4
      0
      2
      ...
      5
      7
      9768
      7844
      1.29
      5.50
      10.00
      8
      6
      H
    
    
      0
      3
      0
      0
      12
      4.000000
      2
      0.666667
      4
      2
      1
      ...
      5
      8
      9772
      9768
      2.20
      3.20
      3.40
      8
      7
      A
    
    
      0
      2
      0
      0
      3
      1.500000
      1
      0.500000
      4
      3
      0
      ...
      5
      8
      7842
      7841
      2.20
      3.10
      3.50
      7
      6
      D
    
    
      0
      0
      0
      2
      2
      1.000000
      2
      1.000000
      0
      4
      3
      ...
      5
      8
      8348
      10238
      2.30
      3.20
      3.20
      8
      7
      A
    
    
      0
      1
      1
      0
      1
      0.500000
      1
      0.500000
      1
      3
      3
      ...
      5
      8
      7844
      10215
      1.70
      3.60
      5.00
      7
      5
      D
    
    
      0
      1
      0
      1
      3
      1.500000
      1
      0.500000
      2
      4
      1
      ...
      5
      8
      10214
      8613
      1.91
      3.25
      4.33
      7
      5
      D
    
    
      0
      3
      0
      0
      7
      2.333333
      0
      0.000000
      5
      0
      2
      ...
      5
      8
      9773
      10264
      1.33
      4.75
      9.50
      7
      7
      D
    
    
      0
      2
      0
      0
      6
      3.000000
      2
      1.000000
      2
      3
      2
      ...
      5
      8
      10212
      6403
      1.91
      3.40
      4.00
      7
      6
      A
    
    
      0
      0
      1
      1
      0
      0.000000
      1
      0.500000
      2
      3
      3
      ...
      5
      9
      8613
      10212
      2.70
      3.20
      2.60
      7
      6
      A
    
    
      0
      1
      1
      0
      3
      1.500000
      4
      2.000000
      4
      2
      2
      ...
      5
      9
      6403
      7844
      2.15
      3.20
      3.50
      6
      5
      A
    
    
      0
      1
      0
      1
      3
      1.500000
      2
      1.000000
      3
      1
      4
      ...
      5
      9
      10238
      158085
      2.10
      3.10
      3.75
      8
      5
      D
    
    
      0
      1
      0
      1
      5
      2.500000
      1
      0.500000
      4
      2
      2
      ...
      5
      9
      10264
      9807
      1.50
      4.00
      6.50
      8
      5
      H
    
    
      0
      2
      0
      0
      6
      3.000000
      1
      0.500000
      6
      0
      2
      ...
      5
      9
      9768
      7842
      1.20
      6.00
      15.00
      3
      5
      H
    
    
      0
      2
      1
      0
      3
      1.000000
      2
      0.666667
      4
      1
      3
      ...
      5
      9
      7841
      10214
      2.30
      3.10
      3.30
      6
      6
      H
    
  

8791 rows × 97 columns



In [ ]:



In [5]:

    
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing 
le = preprocessing.LabelEncoder()

scores = cross_val_score(LogisticRegression(), data[ft], data['Output'], cv=10, scoring='log_loss')
print scores.mean()









    



/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)






    



-0.981429559041






    



/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)



In [9]:

    
pure_data = pd.read_csv('pure_data.csv',index_col='Unnamed: 0')
pure_formation = pd.read_csv('pure_formation.csv',index_col='Unnamed: 0')

ft_pure_data=pure_data.columns.values[0:len(pure_data.columns.values)-1]
ft_pure_formation=['country_id', 'league_id', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id', 'B365H',
       'B365D', 'B365A', 'formation_h', 'formation_a']
pure_formation['formation_a'] = le.fit_transform(pure_formation['formation_a'].astype('str')) 
pure_formation['formation_h'] = le.fit_transform(pure_formation['formation_h'].astype('str'))



In [7]:

    
print pure_data.shape
print pure_formation.shape









    



(19685, 56)
(19685, 14)



In [8]:

    
scores = cross_val_score(LogisticRegression(), pure_data[ft_pure_data], pure_data['Output'], cv=10, scoring='neg_log_loss')
print "Pure Data"
print scores.mean()









    



Pure Data
-1.06908234659



In [8]:

    
print ft_pure_formation
scores = cross_val_score(LogisticRegression(), pure_formation[ft_pure_formation], pure_formation['Output'], cv=10, scoring='neg_log_loss')
print "Pure Formation"
print scores.mean()









    



['country_id', 'league_id', 'season', 'stage', 'date', 'match_api_id', 'home_team_api_id', 'away_team_api_id', 'B365H', 'B365D', 'B365A', 'formation_h', 'formation_a']
Pure Formation
-1.06923334431



In [10]:

    
# Dados com feature Engineering total
data_min = pd.DataFrame()
data_mean = pd.DataFrame()
for column in ft:
    data_min[column]=(data[column] - data[column].min()) / (data[column].max()-data[column].min())
for column in ft:
    data_mean[column]=(data[column] - data[column].mean()) / data[column].std()

    
scores = cross_val_score(LogisticRegression(), data_min[ft], data['Output'], cv=10, scoring='neg_log_loss')
print "MinMax"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data_mean[ft], data['Output'], cv=10, scoring='neg_log_loss')
print "Mean"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[ft], data['Output'], cv=10, scoring='neg_log_loss')
print "Normal"
print scores.mean()









    



MinMax
-0.987484303268
Mean
-0.991968851075
Normal
-0.981429559041



In [ ]:

    
#Classificador mais simple possivel
print data['Output'].value_counts()

print 4042./data.shape[0]









    



H    4042
A    2528
D    2221
Name: Output, dtype: int64
0.459788419975



In [ ]:

    
scores = cross_val_score(LogisticRegression(), data_min[ft], data['Output'], cv=10, scoring='log_loss')
print "MinMax"
print scores.mean(),scores.std()
scores = cross_val_score(LogisticRegression(), data_mean[ft], data['Output'], cv=10, scoring='log_loss')
print "Mean"
print scores.mean(),scores.std()
scores = cross_val_score(LogisticRegression(), data[ft], data['Output'], cv=10, scoring='log_loss')
print "Normal"
print scores.mean(),scores.std()









    



/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)






    



MinMax
-0.987484303268 0.025794254341






    



/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)






    



Mean
-0.991968851075 0.0279115513172






    



/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)



In [10]:

    
from sklearn.neighbors import KNeighborsClassifier
k_range=range(50,200)
k_score=[]
for k in k_range:
    scores = cross_val_score((KNeighborsClassifier(n_neighbors=k)), data[ft], data['Output'], cv=10, scoring='neg_log_loss')
    k_score.append(scores.mean())



In [11]:

    
plt.scatter(k_range,k_score)
plt.show()



In [14]:

    
scores = cross_val_score((KNeighborsClassifier(n_neighbors=100)), data[ft], data['Output'], cv=10, scoring='neg_log_loss')
print scores.mean()









    



-1.09027134044



In [ ]:



In [16]:

    
from  sklearn.ensemble import RandomForestClassifier

k_range=[300,500,800,1000,1300,1500,1800,2000,3000,4000,5000]
k_score=[]
for k in k_range:
    scores = cross_val_score((RandomForestClassifier(n_estimators=k, max_depth=3, n_jobs=-1)), data[ft], data['Output'], cv=10, scoring='neg_log_loss')
    k_score.append(scores.mean())



In [ ]:

    
print len(k_score)



In [ ]:

    
print len(k_score)
plt.scatter([300,500,800,1000,1300,1500,1800,2000,3000,4000,5000],k_score)
plt.show()



In [ ]:

    
from  sklearn.ensemble import GradientBoostingClassifier

k_range=[1000,2000,3000,4000]
k_score=[]
for k in k_range:
    scores = cross_val_score((GradientBoostingClassifier(n_estimators=k)), data[ft], data['Output'], cv=5, scoring='accuracy')
    k_score.append(scores.mean())



In [ ]:

    
len(k_score)
plt.scatter([1000,2000],k_score)
plt.show()



In [ ]:

    
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

dt = SelectKBest(chi2, k=48).fit_transform(data[ft],data['Output'])
print dt.shape
print data.shape
print dt[0]

scores = cross_val_score(LogisticRegression(), dt, data['Output'], cv=10, scoring='log_loss')
print scores.mean()



In [ ]:



In [ ]:

    
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[ft],data['Output'],test_size=0.33)

lm = LogisticRegression()
lm.fit(X_train,y_train)
prob= lm.predict_proba(X_test)
pre =  lm.predict(X_test)
print prob[0],pre[0]
print pre

print lm.score(X_test,y_test)



In [ ]:

    
data.columns.values



In [11]:

    
play_style = ['h_buildUpPlaySpeed',
       'h_buildUpPlaySpeedClass', 'h_buildUpPlayDribblingClass',
       'h_buildUpPlayPassing', 'h_buildUpPlayPassingClass',
       'h_buildUpPlayPositioningClass', 'h_chanceCreationPassing',
       'h_chanceCreationPassingClass', 'h_chanceCreationCrossing',
       'h_chanceCreationCrossingClass', 'h_chanceCreationShooting',
       'h_chanceCreationShootingClass', 'h_chanceCreationPositioningClass',
       'h_defencePressure', 'h_defencePressureClass',
       'h_defenceAggression', 'h_defenceAggressionClass',
       'h_defenceTeamWidth', 'h_defenceTeamWidthClass',
       'h_defenceDefenderLineClass', 'a_buildUpPlaySpeed',
       'a_buildUpPlaySpeedClass', 'a_buildUpPlayDribblingClass',
       'a_buildUpPlayPassing', 'a_buildUpPlayPassingClass',
       'a_buildUpPlayPositioningClass', 'a_chanceCreationPassing',
       'a_chanceCreationPassingClass', 'a_chanceCreationCrossing',
       'a_chanceCreationCrossingClass', 'a_chanceCreationShooting',
       'a_chanceCreationShootingClass', 'a_chanceCreationPositioningClass',
       'a_defencePressure', 'a_defencePressureClass',
       'a_defenceAggression', 'a_defenceAggressionClass',
       'a_defenceTeamWidth', 'a_defenceTeamWidthClass',
       'a_defenceDefenderLineClass']

fifa_ratings=['h_avg_height', 'h_avg_weight',
       'a_avg_height', 'a_avg_weight', 'h_overall', 'h_potential', 'h_def',
       'h_mid', 'h_att', 'a_overall', 'a_potential', 'a_def', 'a_mid',
       'a_att']
base= ['season', 'stage','league_id','a_date', 'h_date','B365H',
       'B365D', 'B365A', 'formation_h', 'formation_a']

goals = ['home_V', 'home_D', 'home_E', 'home_GF', 'home_AVG_GF', 'home_GS',
       'home_AVG_GS', 'home_VG', 'home_DG', 'home_EG', 'home_GFG',
       'home_AVG_GFG', 'home_GSG', 'home_AVG_GSG', 'away_V', 'away_D',
       'away_E', 'away_GF', 'away_AVG_GF', 'away_GS', 'away_AVG_GS',
       'away_VG', 'away_DG', 'away_EG', 'away_GFG', 'away_AVG_GFG',
       'away_GSG']

print goals+base









    



['home_V', 'home_D', 'home_E', 'home_GF', 'home_AVG_GF', 'home_GS', 'home_AVG_GS', 'home_VG', 'home_DG', 'home_EG', 'home_GFG', 'home_AVG_GFG', 'home_GSG', 'home_AVG_GSG', 'away_V', 'away_D', 'away_E', 'away_GF', 'away_AVG_GF', 'away_GS', 'away_AVG_GS', 'away_VG', 'away_DG', 'away_EG', 'away_GFG', 'away_AVG_GFG', 'away_GSG', 'season', 'stage', 'league_id', 'a_date', 'h_date', 'B365H', 'B365D', 'B365A', 'formation_h', 'formation_a']



In [ ]:

    
scores = cross_val_score(LogisticRegression(), data[fifa_ratings+goals+base], data['Output'], cv=10, scoring='log_loss')
print scores.mean()



In [12]:

    
from  sklearn.ensemble import RandomForestClassifier
scores = cross_val_score((RandomForestClassifier(n_estimators=1000, max_depth=3)), data[ft], data['Output'], cv=10, scoring='neg_log_loss')
print scores.mean()









    



-0.984984243937



In [ ]:

    
k_range=[1000,2000,3000,4000,5000]
k_score=[]
for k in k_range:
    scores = cross_val_score((RandomForestClassifier(n_estimators=k, max_depth=3)), data[fifa_ratings+goals+base], data['Output'], cv=10, scoring='accuracy')
    k_score.append(scores.mean())



In [ ]:

    
print k_score



In [15]:

    
scores = cross_val_score(LogisticRegression(), data[base], data['Output'], cv=10, scoring='neg_log_loss')
print "Base"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[fifa_ratings], data['Output'], cv=10, scoring='neg_log_loss')
print "Ratings"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[goals], data['Output'], cv=10, scoring='neg_log_loss')
print "Goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Style"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[base+goals], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[base+fifa_ratings], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+Ratings"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[base+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+Style"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[fifa_ratings+goals], data['Output'], cv=10, scoring='neg_log_loss')
print "Rating+Goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Goals+Style"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[fifa_ratings+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Ratings+Style"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[base+fifa_ratings+goals], data['Output'], cv=10, scoring='neg_log_loss')
print "base+Rating+Goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[base+fifa_ratings+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "base+Rating+play_style"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[base+goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "base+goals+play_style"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[fifa_ratings+goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Rating+goals+play_style"
print scores.mean()









    



Base
-0.973562814441
Ratings
-0.984979257755
Goals
-1.03314259493
Style
-1.06146631889
Base+goals
-0.974094074696
Base+Ratings
-0.97333685394
Base+Style
-0.979329914618
Rating+Goals
-0.986844484151
Goals+Style
-1.03907113122
Ratings+Style
-0.998159144845
base+Rating+Goals
-0.974709147966
base+Rating+play_style
-0.980211581839
base+goals+play_style
-0.981671730394
Rating+goals+play_style
-1.0006185998



In [17]:

    
scores = cross_val_score(RandomForestClassifier(n_estimators=1000), data[play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Style"
print scores.mean()
scores = cross_val_score(RandomForestClassifier(n_estimators=1000,n_jobs=4), data[base+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+Style"
print scores.mean()
scores = cross_val_score(RandomForestClassifier(n_estimators=1000,n_jobs=4), data[goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Goals+Style"
print scores.mean()
scores = cross_val_score(RandomForestClassifier(n_estimators=1000,n_jobs=4), data[fifa_ratings+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Ratings+Style"
print scores.mean()
scores = cross_val_score(RandomForestClassifier(n_estimators=1000,n_jobs=4), data[fifa_ratings+goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Rating+goals+play_style"
print scores.mean()









    



Style
-1.08837422037
Base+Style
-1.00342865589
Goals+Style
-1.04459693463
Ratings+Style
-1.00359251103
Rating+goals+play_style
-0.996481363352



In [ ]:



In [16]:

    
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base], data['Output'], cv=10, scoring='neg_log_loss')
print "Base"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[fifa_ratings], data['Output'], cv=10, scoring='neg_log_loss')
print "Ratings"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[goals], data['Output'], cv=10, scoring='neg_log_loss')
print "Goals"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Style"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base+goals], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+goals"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base+fifa_ratings], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+Ratings"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+Style"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[fifa_ratings+goals], data['Output'], cv=10, scoring='neg_log_loss')
print "Rating+Goals"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Goals+Style"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[fifa_ratings+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Ratings+Style"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base+fifa_ratings+goals], data['Output'], cv=10, scoring='neg_log_loss')
print "base+Rating+Goals"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base+fifa_ratings+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "base+Rating+play_style"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base+goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "base+goals+play_style"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[fifa_ratings+goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Rating+goals+play_style"
print scores.mean()









    



Base
-1.03659004831
Ratings
-0.994091173579
Goals
-1.03785982671
Style
-1.06445276755
Base+goals
-1.04955993303
Base+Ratings
-1.00901515738
Base+Style
-1.07038982999
Rating+Goals
-0.992641108592
Goals+Style
-1.04071347198
Ratings+Style
-1.02397472781
base+Rating+Goals
-1.02632818715
base+Rating+play_style
-1.04787193518
base+goals+play_style
-1.0604808121
Rating+goals+play_style
-1.01855727069



In [ ]:

    
print uk['Output'].value_counts()[0]/(uk.shape[0]*1.)
print de['Output'].value_counts()[0]/(de.shape[0]*1.)
print fr['Output'].value_counts()[0]/(fr.shape[0]*1.)
print it['Output'].value_counts()[0]/(it.shape[0]*1.)
print pt['Output'].value_counts()[0]/(pt.shape[0]*1.)
print es['Output'].value_counts()[0]/(es.shape[0]*1.)



In [ ]:

    
scores = cross_val_score(LogisticRegression(), uk[base], uk['Output'], cv=10, scoring='log_loss')
print "Base"
print scores.mean()



In [ ]:

    
mandante_forte = data[data['h_overall']>= (data['a_overall']+3)] 
visitante_forte = data[data['a_overall']>= (data['h_overall']+3)] 
iguais = data[abs(data['h_overall'] -(data['a_overall'])) < 3 ] 

print mandante_forte['Output'].value_counts()[0]/(mandante_forte.shape[0]*1.)
print visitante_forte['Output'].value_counts()[0]/(visitante_forte.shape[0]*1.)
print iguais['Output'].value_counts()[0]/(iguais.shape[0]*1.)



In [ ]:

    
print mandante_forte.shape[0]+visitante_forte.shape[0]+ iguais.shape[0]

print uk.shape[0]+de.shape[0]+es.shape[0]+it.shape[0]+fr.shape[0]+pt.shape[0]



In [18]:

    
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[ft],data['Output'], test_size=0.33)

treino= pd.concat([X_train,y_train],axis=1)
mandante_forte = treino[treino['h_overall']>= (treino['a_overall']+3)] 
visitante_forte = treino[treino['a_overall']>= (treino['h_overall']+3)] 
iguais = treino[abs(treino['h_overall'] -(treino['a_overall'])) < 3 ]

print mandante_forte['Output'].value_counts()[0]/(mandante_forte.shape[0]*1.)
print visitante_forte['Output'].value_counts()[0]/(visitante_forte.shape[0]*1.)
print iguais['Output'].value_counts()[0]/(iguais.shape[0]*1.)









    



0.66847826087
0.499057196732
0.460635881908



In [19]:

    
print "Mandante Forte"

scores = cross_val_score(LogisticRegression(), mandante_forte[base], mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base"
print scores.mean()
scores = cross_val_score(LogisticRegression(), mandante_forte[base+goals], mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base-Goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), mandante_forte[base+fifa_ratings], mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base+ratings"
print scores.mean()
scores = cross_val_score(LogisticRegression(), mandante_forte[base+goals+fifa_ratings], mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base+goals+ratings"
print scores.mean()

scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),mandante_forte[base] , mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),mandante_forte[base+goals] , mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base-Goals"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),mandante_forte[base+fifa_ratings] , mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base+ratings"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),mandante_forte[base+goals+fifa_ratings] , mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base+goals+ratings"
print scores.mean()









    



Mandante Forte
Regressão Logistica - Base
-0.814989731478
Regressão Logistica - Base-Goals
-0.816478929558
Regressão Logistica - Base+ratings
-0.820619992537
Regressão Logistica - Base+goals+ratings
-0.823411274004
Random - Base
-0.852166724971
Random - Base-Goals
-0.837783283072
Random - Base+ratings
-0.841615729019
Random - Base+goals+ratings
-0.834640409641



In [43]:

    
print len(k_score)

plt.scatter([100,500,1000,2000],k_score)
plt.show()



In [20]:

    
print "visitante Forte"

scores = cross_val_score(LogisticRegression(), visitante_forte[base], visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base"
print scores.mean()
scores = cross_val_score(LogisticRegression(), visitante_forte[base+goals], visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base-Goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), visitante_forte[base+fifa_ratings], visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base+ratings"
print scores.mean()
scores = cross_val_score(LogisticRegression(), visitante_forte[base+goals+fifa_ratings], visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base+goals+ratings"
print scores.mean()

scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),visitante_forte[base] , visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),visitante_forte[base+goals] , visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base-Goals"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),visitante_forte[base+fifa_ratings] , visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base+ratings"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),visitante_forte[base+goals+fifa_ratings] , visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base+goals+ratings"
print scores.mean()









    



visitante Forte
Regressão Logistica - Base
-1.0036004086
Regressão Logistica - Base-Goals
-1.01224050007
Regressão Logistica - Base+ratings
-1.01356365601
Regressão Logistica - Base+goals+ratings
-1.02250358003
Random - Base
-1.07802630191
Random - Base-Goals
-1.03579179506
Random - Base+ratings
-1.041927169
Random - Base+goals+ratings
-1.02999456242



In [22]:

    
print "iguais Forte"

scores = cross_val_score(LogisticRegression(), iguais[base], iguais['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base"
print scores.mean()
scores = cross_val_score(LogisticRegression(), iguais[base+goals], iguais['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base-Goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), iguais[base+fifa_ratings], iguais['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base+ratings"
print scores.mean()
scores = cross_val_score(LogisticRegression(), iguais[base+goals+fifa_ratings], iguais['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base+goals+ratings"
print scores.mean()

scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),iguais[base] , iguais['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),iguais[base+goals] , iguais['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base-Goals"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),iguais[base+fifa_ratings] , iguais['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base+ratings"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),iguais[base+goals+fifa_ratings] , iguais['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base+goals+ratings"
print scores.mean()









    



iguais Forte
Regressão Logistica - Base
-1.05508327122
Regressão Logistica - Base-Goals
-1.05845467839
Regressão Logistica - Base+ratings
-1.05967056878
Regressão Logistica - Base+goals+ratings
-1.06414120982
Random - Base
-1.10259356201
Random - Base-Goals
-1.07685361898
Random - Base+ratings
-1.07201231689
Random - Base+goals+ratings
-1.06899597774



In [23]:

    
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[ft],data['Output'], test_size=0.33)

treino= pd.concat([X_train,y_train],axis=1)
mandante_forte = treino[treino['h_overall']>= (treino['a_overall']+3)] 
visitante_forte = treino[treino['a_overall']>= (treino['h_overall']+3)] 
iguais = treino[abs(treino['h_overall'] -(treino['a_overall'])) < 3 ]

print mandante_forte['Output'].value_counts()[0]/(mandante_forte.shape[0]*1.)
print visitante_forte['Output'].value_counts()[0]/(visitante_forte.shape[0]*1.)
print iguais['Output'].value_counts()[0]/(iguais.shape[0]*1.)









    



0.666666666667
0.509194673431
0.452480121166



In [24]:

    
reg_H = LogisticRegression()
reg_H.fit(mandante_forte[base],mandante_forte['Output'])
reg_A = LogisticRegression()
reg_A.fit(visitante_forte[base],visitante_forte['Output'])
reg_I = LogisticRegression()
reg_I.fit(iguais[base],iguais['Output'])









    Out[24]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [25]:

    
teste= pd.concat([X_train,y_train],axis=1)
m = treino[teste['h_overall']>= (teste['a_overall']+3)] 
v = treino[teste['a_overall']>= (teste['h_overall']+3)] 
i = treino[abs(teste['h_overall'] -(teste['a_overall'])) < 3 ]



In [32]:

    
print reg_H.predict_log_proba(m[base]).mean()
print reg_A.predict_log_proba(v[base]).mean()
print reg_I.predict_log_proba(i[base]).mean()
print (-1.49989713783 + -1.21385058598  + -1.14233720246)/3









    



-1.49989713783
-1.21385058598
-1.14233720246
-1.28536164209



In [35]:

    
scores = cross_val_score(LogisticRegression(), data[base], data['Output'], cv=10, scoring='accuracy')
print "Regressão Logistica - Base"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[ft], data['Output'], cv=10, scoring='accuracy')
print "Regressão Logistica - Base"
print scores.mean()









    



Regressão Logistica - Base
0.532708042354
Regressão Logistica - Base
0.522581341142

	home_V	home_D	home_E	home_GF	home_AVG_GF	home_GS	home_AVG_GS	home_VG	home_DG	home_EG	...	season	stage	home_team_api_id	away_team_api_id	B365H	B365D	B365A	formation_h	formation_a	Output
0	1	1	0	4	2.000000	4	2.000000	5	2	2	...	0	10	9825	8654	1.17	7.00	17.00	8	5	H
0	0	0	2	1	0.500000	1	0.500000	3	4	2	...	0	10	10252	8658	1.80	3.60	4.50	13	13	D
0	1	1	0	2	1.000000	1	0.500000	2	3	4	...	0	10	8668	10194	1.57	3.75	6.50	13	12	H
0	0	1	1	3	1.500000	4	2.000000	3	4	2	...	0	10	10261	8472	2.20	3.20	3.50	12	8	H
0	0	1	1	2	1.000000	3	1.500000	1	5	3	...	0	10	8602	8456	5.50	3.40	1.73	13	13	H
0	0	0	2	1	0.500000	1	0.500000	2	4	3	...	0	10	8655	8455	7.50	4.33	1.44	13	11	A
0	0	1	1	1	0.500000	2	1.000000	1	2	6	...	0	10	9879	8528	1.91	3.40	4.20	13	12	H
0	1	0	1	4	2.000000	2	1.000000	5	0	5	...	0	11	10260	8602	1.22	6.00	15.00	13	13	H
0	2	1	0	5	1.666667	4	1.333333	6	2	2	...	0	11	9825	10261	1.29	5.50	11.00	11	12	A
0	1	1	1	5	1.666667	5	1.666667	3	4	3	...	0	11	8650	8455	3.80	3.40	2.00	12	11	H
0	1	0	1	1	0.500000	0	0.000000	2	2	6	...	0	11	8472	10194	2.10	3.20	3.75	12	12	H
0	0	1	1	1	0.500000	2	1.000000	2	5	3	...	0	11	8655	8528	1.80	3.60	4.50	5	13	H
0	1	1	1	3	1.000000	2	0.666667	2	2	6	...	0	11	9879	10252	2.20	3.25	3.40	13	13	D
0	1	1	1	2	0.666667	2	0.666667	2	3	5	...	0	11	8658	8654	1.91	3.40	4.20	12	11	D
0	1	2	0	5	1.666667	6	2.000000	4	5	1	...	0	11	8483	8668	4.50	3.60	1.80	11	13	D
0	1	0	1	3	1.500000	2	1.000000	4	4	3	...	0	12	8586	8472	1.53	4.00	6.50	13	13	D
0	0	1	1	2	1.000000	3	1.500000	1	6	4	...	0	12	8654	8659	2.30	3.25	3.20	12	13	D
0	1	0	1	3	1.500000	2	1.000000	2	6	3	...	0	12	8602	9825	5.50	3.60	1.67	13	8	A
0	1	1	0	2	1.000000	2	1.000000	3	7	1	...	0	12	10194	8658	2.10	3.20	3.75	12	12	H
0	1	0	1	3	1.500000	1	0.500000	2	5	4	...	0	12	8528	8650	4.75	3.60	1.75	13	12	D
0	1	0	1	7	3.500000	3	1.500000	5	4	2	...	0	12	10261	8655	1.91	3.40	4.20	12	1	A
0	2	0	0	4	2.000000	0	0.000000	8	2	1	...	0	12	8455	9879	1.20	6.50	15.00	11	13	H
0	0	0	2	2	1.000000	2	1.000000	4	4	4	...	0	13	8586	8655	1.57	3.80	6.50	12	15	H
0	0	1	1	3	1.500000	4	2.000000	1	6	5	...	0	13	8654	8483	1.91	3.60	4.00	12	8	D
0	1	0	2	3	1.000000	2	0.666667	4	4	4	...	0	13	10252	10260	4.50	3.60	1.80	13	12	D
0	2	0	1	4	1.333333	1	0.333333	3	3	6	...	0	13	8668	9825	3.40	3.30	2.20	13	8	A
0	1	1	1	8	2.666667	5	1.666667	5	5	2	...	0	13	10261	9879	1.91	3.40	4.20	12	12	D
0	0	1	1	0	0.000000	3	1.500000	6	3	3	...	0	13	8456	8658	1.40	4.50	8.50	8	12	D
0	1	1	0	4	2.000000	4	2.000000	4	7	1	...	0	13	10194	8650	3.75	3.25	2.10	12	8	H
0	0	0	2	2	1.000000	2	1.000000	2	5	5	...	0	13	8528	8659	2.50	3.25	2.88	13	13	H
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
0	0	2	0	0	0.000000	2	1.000000	6	16	11	...	5	34	10238	6403	2.30	3.50	2.90	8	5	D
0	0	1	1	1	0.500000	3	1.500000	9	13	11	...	5	34	9807	7842	3.10	3.50	2.20	6	5	H
0	2	0	0	3	1.500000	1	0.500000	28	4	1	...	5	34	9772	10214	1.10	9.00	21.00	3	6	H
0	2	0	1	7	2.333333	3	1.000000	16	7	10	...	5	34	10264	9768	7.00	4.33	1.44	8	7	A
0	1	1	0	2	1.000000	3	1.500000	2	1	2	...	5	6	158085	9807	2.40	3.00	3.20	6	6	D
0	1	1	0	1	0.500000	1	0.500000	2	2	1	...	5	6	8613	9768	9.50	5.00	1.33	8	7	D
0	0	1	1	0	0.000000	2	1.000000	0	4	1	...	5	6	8348	9773	15.00	5.50	1.22	7	6	D
0	2	0	0	3	1.500000	0	0.000000	2	3	0	...	5	6	10214	10238	1.91	3.40	4.00	7	6	D
0	1	1	0	2	1.000000	2	1.000000	2	1	2	...	5	6	7841	10215	1.62	3.60	6.00	2	7	H
0	1	0	1	2	1.000000	1	0.500000	1	1	3	...	5	6	7844	10264	3.50	3.00	2.25	6	7	A
0	0	3	0	1	0.333333	9	3.000000	0	6	0	...	5	7	10215	10212	4.00	3.10	2.05	7	6	H
0	0	1	1	1	0.500000	4	2.000000	2	2	2	...	5	7	6403	10214	2.20	3.50	3.10	7	6	H
0	2	0	0	9	4.500000	1	0.500000	4	2	0	...	5	7	10264	158085	1.53	4.00	6.00	8	6	D
0	2	0	0	3	1.500000	0	0.000000	4	0	2	...	5	7	9773	9807	1.14	7.00	21.00	7	6	H
0	2	1	0	3	1.000000	2	0.666667	3	1	2	...	5	7	7841	8613	2.05	3.30	3.60	7	6	H
0	0	0	2	4	2.000000	4	2.000000	1	1	4	...	5	7	10238	7842	2.50	3.20	2.88	8	6	H
0	1	0	1	2	1.000000	1	0.500000	4	0	2	...	5	7	9768	7844	1.29	5.50	10.00	8	6	H
0	3	0	0	12	4.000000	2	0.666667	4	2	1	...	5	8	9772	9768	2.20	3.20	3.40	8	7	A
0	2	0	0	3	1.500000	1	0.500000	4	3	0	...	5	8	7842	7841	2.20	3.10	3.50	7	6	D
0	0	0	2	2	1.000000	2	1.000000	0	4	3	...	5	8	8348	10238	2.30	3.20	3.20	8	7	A
0	1	1	0	1	0.500000	1	0.500000	1	3	3	...	5	8	7844	10215	1.70	3.60	5.00	7	5	D
0	1	0	1	3	1.500000	1	0.500000	2	4	1	...	5	8	10214	8613	1.91	3.25	4.33	7	5	D
0	3	0	0	7	2.333333	0	0.000000	5	0	2	...	5	8	9773	10264	1.33	4.75	9.50	7	7	D
0	2	0	0	6	3.000000	2	1.000000	2	3	2	...	5	8	10212	6403	1.91	3.40	4.00	7	6	A
0	0	1	1	0	0.000000	1	0.500000	2	3	3	...	5	9	8613	10212	2.70	3.20	2.60	7	6	A
0	1	1	0	3	1.500000	4	2.000000	4	2	2	...	5	9	6403	7844	2.15	3.20	3.50	6	5	A
0	1	0	1	3	1.500000	2	1.000000	3	1	4	...	5	9	10238	158085	2.10	3.10	3.75	8	5	D
0	1	0	1	5	2.500000	1	0.500000	4	2	2	...	5	9	10264	9807	1.50	4.00	6.50	8	5	H
0	2	0	0	6	3.000000	1	0.500000	6	0	2	...	5	9	9768	7842	1.20	6.00	15.00	3	5	H
0	2	1	0	3	1.000000	2	0.666667	4	1	3	...	5	9	7841	10214	2.30	3.10	3.30	6	6	H