In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sqlite3
import numpy as np
from numpy import random
database = 'database.sqlite'
conn = sqlite3.connect(database)

In [2]:
uk = pd.read_csv('./leagues/1729.csv',index_col='Unnamed: 0')
de = pd.read_csv('./leagues/7809.csv',index_col='Unnamed: 0')
fr = pd.read_csv('./leagues/4769.csv',index_col='Unnamed: 0')
it = pd.read_csv('./leagues/10257.csv',index_col='Unnamed: 0')
es = pd.read_csv('./leagues/21518.csv',index_col='Unnamed: 0')
pt = pd.read_csv('./leagues/17642.csv',index_col='Unnamed: 0')

data= pd.concat([uk,de])
data1= pd.concat([es,fr])
data= pd.concat([data,it])
data1= pd.concat([data1,pt])
data= pd.concat([data,data1])

In [3]:
ft = ['home_V','home_D','home_E','home_GF','home_AVG_GF','home_GS','home_AVG_GS','home_VG','home_DG','home_EG','home_GFG','home_AVG_GFG','home_GSG','home_AVG_GSG','away_V','away_D','away_E','away_GF','away_AVG_GF','away_GS','away_AVG_GS','away_VG','away_DG','away_EG','away_GFG','away_AVG_GFG','away_GSG','away_AVG_GSG','h_buildUpPlaySpeed','h_buildUpPlaySpeedClass','h_buildUpPlayDribblingClass','h_buildUpPlayPassing','h_buildUpPlayPassingClass','h_buildUpPlayPositioningClass','h_chanceCreationPassing','h_chanceCreationPassingClass','h_chanceCreationCrossing','h_chanceCreationCrossingClass','h_chanceCreationShooting','h_chanceCreationShootingClass','h_chanceCreationPositioningClass','h_defencePressure','h_defencePressureClass','h_defenceAggression','h_defenceAggressionClass','h_defenceTeamWidth','h_defenceTeamWidthClass','h_defenceDefenderLineClass','a_buildUpPlaySpeed','a_buildUpPlaySpeedClass','a_buildUpPlayDribblingClass','a_buildUpPlayPassing','a_buildUpPlayPassingClass','a_buildUpPlayPositioningClass','a_chanceCreationPassing','a_chanceCreationPassingClass','a_chanceCreationCrossing','a_chanceCreationCrossingClass','a_chanceCreationShooting','a_chanceCreationShootingClass','a_chanceCreationPositioningClass','a_defencePressure','a_defencePressureClass','a_defenceAggression','a_defenceAggressionClass','a_defenceTeamWidth','a_defenceTeamWidthClass','a_defenceDefenderLineClass','h_avg_height','h_avg_weight','a_avg_height','a_avg_weight','h_overall','h_potential','h_def','h_mid','h_att','a_overall','a_potential','a_def','a_mid','a_att','a_date','h_date','id','season','stage','home_team_api_id','away_team_api_id','B365H','B365D','B365A','formation_h','formation_a','league_id'] 

#for column in ft:
#    data[column]=(data[column] - data[column].mean()) / data[column].std()

In [4]:
data


Out[4]:
home_V home_D home_E home_GF home_AVG_GF home_GS home_AVG_GS home_VG home_DG home_EG ... season stage home_team_api_id away_team_api_id B365H B365D B365A formation_h formation_a Output
0 1 1 0 4 2.000000 4 2.000000 5 2 2 ... 0 10 9825 8654 1.17 7.00 17.00 8 5 H
0 0 0 2 1 0.500000 1 0.500000 3 4 2 ... 0 10 10252 8658 1.80 3.60 4.50 13 13 D
0 1 1 0 2 1.000000 1 0.500000 2 3 4 ... 0 10 8668 10194 1.57 3.75 6.50 13 12 H
0 0 1 1 3 1.500000 4 2.000000 3 4 2 ... 0 10 10261 8472 2.20 3.20 3.50 12 8 H
0 0 1 1 2 1.000000 3 1.500000 1 5 3 ... 0 10 8602 8456 5.50 3.40 1.73 13 13 H
0 0 0 2 1 0.500000 1 0.500000 2 4 3 ... 0 10 8655 8455 7.50 4.33 1.44 13 11 A
0 0 1 1 1 0.500000 2 1.000000 1 2 6 ... 0 10 9879 8528 1.91 3.40 4.20 13 12 H
0 1 0 1 4 2.000000 2 1.000000 5 0 5 ... 0 11 10260 8602 1.22 6.00 15.00 13 13 H
0 2 1 0 5 1.666667 4 1.333333 6 2 2 ... 0 11 9825 10261 1.29 5.50 11.00 11 12 A
0 1 1 1 5 1.666667 5 1.666667 3 4 3 ... 0 11 8650 8455 3.80 3.40 2.00 12 11 H
0 1 0 1 1 0.500000 0 0.000000 2 2 6 ... 0 11 8472 10194 2.10 3.20 3.75 12 12 H
0 0 1 1 1 0.500000 2 1.000000 2 5 3 ... 0 11 8655 8528 1.80 3.60 4.50 5 13 H
0 1 1 1 3 1.000000 2 0.666667 2 2 6 ... 0 11 9879 10252 2.20 3.25 3.40 13 13 D
0 1 1 1 2 0.666667 2 0.666667 2 3 5 ... 0 11 8658 8654 1.91 3.40 4.20 12 11 D
0 1 2 0 5 1.666667 6 2.000000 4 5 1 ... 0 11 8483 8668 4.50 3.60 1.80 11 13 D
0 1 0 1 3 1.500000 2 1.000000 4 4 3 ... 0 12 8586 8472 1.53 4.00 6.50 13 13 D
0 0 1 1 2 1.000000 3 1.500000 1 6 4 ... 0 12 8654 8659 2.30 3.25 3.20 12 13 D
0 1 0 1 3 1.500000 2 1.000000 2 6 3 ... 0 12 8602 9825 5.50 3.60 1.67 13 8 A
0 1 1 0 2 1.000000 2 1.000000 3 7 1 ... 0 12 10194 8658 2.10 3.20 3.75 12 12 H
0 1 0 1 3 1.500000 1 0.500000 2 5 4 ... 0 12 8528 8650 4.75 3.60 1.75 13 12 D
0 1 0 1 7 3.500000 3 1.500000 5 4 2 ... 0 12 10261 8655 1.91 3.40 4.20 12 1 A
0 2 0 0 4 2.000000 0 0.000000 8 2 1 ... 0 12 8455 9879 1.20 6.50 15.00 11 13 H
0 0 0 2 2 1.000000 2 1.000000 4 4 4 ... 0 13 8586 8655 1.57 3.80 6.50 12 15 H
0 0 1 1 3 1.500000 4 2.000000 1 6 5 ... 0 13 8654 8483 1.91 3.60 4.00 12 8 D
0 1 0 2 3 1.000000 2 0.666667 4 4 4 ... 0 13 10252 10260 4.50 3.60 1.80 13 12 D
0 2 0 1 4 1.333333 1 0.333333 3 3 6 ... 0 13 8668 9825 3.40 3.30 2.20 13 8 A
0 1 1 1 8 2.666667 5 1.666667 5 5 2 ... 0 13 10261 9879 1.91 3.40 4.20 12 12 D
0 0 1 1 0 0.000000 3 1.500000 6 3 3 ... 0 13 8456 8658 1.40 4.50 8.50 8 12 D
0 1 1 0 4 2.000000 4 2.000000 4 7 1 ... 0 13 10194 8650 3.75 3.25 2.10 12 8 H
0 0 0 2 2 1.000000 2 1.000000 2 5 5 ... 0 13 8528 8659 2.50 3.25 2.88 13 13 H
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
0 0 2 0 0 0.000000 2 1.000000 6 16 11 ... 5 34 10238 6403 2.30 3.50 2.90 8 5 D
0 0 1 1 1 0.500000 3 1.500000 9 13 11 ... 5 34 9807 7842 3.10 3.50 2.20 6 5 H
0 2 0 0 3 1.500000 1 0.500000 28 4 1 ... 5 34 9772 10214 1.10 9.00 21.00 3 6 H
0 2 0 1 7 2.333333 3 1.000000 16 7 10 ... 5 34 10264 9768 7.00 4.33 1.44 8 7 A
0 1 1 0 2 1.000000 3 1.500000 2 1 2 ... 5 6 158085 9807 2.40 3.00 3.20 6 6 D
0 1 1 0 1 0.500000 1 0.500000 2 2 1 ... 5 6 8613 9768 9.50 5.00 1.33 8 7 D
0 0 1 1 0 0.000000 2 1.000000 0 4 1 ... 5 6 8348 9773 15.00 5.50 1.22 7 6 D
0 2 0 0 3 1.500000 0 0.000000 2 3 0 ... 5 6 10214 10238 1.91 3.40 4.00 7 6 D
0 1 1 0 2 1.000000 2 1.000000 2 1 2 ... 5 6 7841 10215 1.62 3.60 6.00 2 7 H
0 1 0 1 2 1.000000 1 0.500000 1 1 3 ... 5 6 7844 10264 3.50 3.00 2.25 6 7 A
0 0 3 0 1 0.333333 9 3.000000 0 6 0 ... 5 7 10215 10212 4.00 3.10 2.05 7 6 H
0 0 1 1 1 0.500000 4 2.000000 2 2 2 ... 5 7 6403 10214 2.20 3.50 3.10 7 6 H
0 2 0 0 9 4.500000 1 0.500000 4 2 0 ... 5 7 10264 158085 1.53 4.00 6.00 8 6 D
0 2 0 0 3 1.500000 0 0.000000 4 0 2 ... 5 7 9773 9807 1.14 7.00 21.00 7 6 H
0 2 1 0 3 1.000000 2 0.666667 3 1 2 ... 5 7 7841 8613 2.05 3.30 3.60 7 6 H
0 0 0 2 4 2.000000 4 2.000000 1 1 4 ... 5 7 10238 7842 2.50 3.20 2.88 8 6 H
0 1 0 1 2 1.000000 1 0.500000 4 0 2 ... 5 7 9768 7844 1.29 5.50 10.00 8 6 H
0 3 0 0 12 4.000000 2 0.666667 4 2 1 ... 5 8 9772 9768 2.20 3.20 3.40 8 7 A
0 2 0 0 3 1.500000 1 0.500000 4 3 0 ... 5 8 7842 7841 2.20 3.10 3.50 7 6 D
0 0 0 2 2 1.000000 2 1.000000 0 4 3 ... 5 8 8348 10238 2.30 3.20 3.20 8 7 A
0 1 1 0 1 0.500000 1 0.500000 1 3 3 ... 5 8 7844 10215 1.70 3.60 5.00 7 5 D
0 1 0 1 3 1.500000 1 0.500000 2 4 1 ... 5 8 10214 8613 1.91 3.25 4.33 7 5 D
0 3 0 0 7 2.333333 0 0.000000 5 0 2 ... 5 8 9773 10264 1.33 4.75 9.50 7 7 D
0 2 0 0 6 3.000000 2 1.000000 2 3 2 ... 5 8 10212 6403 1.91 3.40 4.00 7 6 A
0 0 1 1 0 0.000000 1 0.500000 2 3 3 ... 5 9 8613 10212 2.70 3.20 2.60 7 6 A
0 1 1 0 3 1.500000 4 2.000000 4 2 2 ... 5 9 6403 7844 2.15 3.20 3.50 6 5 A
0 1 0 1 3 1.500000 2 1.000000 3 1 4 ... 5 9 10238 158085 2.10 3.10 3.75 8 5 D
0 1 0 1 5 2.500000 1 0.500000 4 2 2 ... 5 9 10264 9807 1.50 4.00 6.50 8 5 H
0 2 0 0 6 3.000000 1 0.500000 6 0 2 ... 5 9 9768 7842 1.20 6.00 15.00 3 5 H
0 2 1 0 3 1.000000 2 0.666667 4 1 3 ... 5 9 7841 10214 2.30 3.10 3.30 6 6 H

8791 rows × 97 columns


In [ ]:


In [5]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing 
le = preprocessing.LabelEncoder()

scores = cross_val_score(LogisticRegression(), data[ft], data['Output'], cv=10, scoring='log_loss')
print scores.mean()


/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
-0.981429559041
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)

In [9]:
pure_data = pd.read_csv('pure_data.csv',index_col='Unnamed: 0')
pure_formation = pd.read_csv('pure_formation.csv',index_col='Unnamed: 0')

ft_pure_data=pure_data.columns.values[0:len(pure_data.columns.values)-1]
ft_pure_formation=['country_id', 'league_id', 'season', 'stage', 'date',
       'match_api_id', 'home_team_api_id', 'away_team_api_id', 'B365H',
       'B365D', 'B365A', 'formation_h', 'formation_a']
pure_formation['formation_a'] = le.fit_transform(pure_formation['formation_a'].astype('str')) 
pure_formation['formation_h'] = le.fit_transform(pure_formation['formation_h'].astype('str'))

In [7]:
print pure_data.shape
print pure_formation.shape


(19685, 56)
(19685, 14)

In [8]:
scores = cross_val_score(LogisticRegression(), pure_data[ft_pure_data], pure_data['Output'], cv=10, scoring='neg_log_loss')
print "Pure Data"
print scores.mean()


Pure Data
-1.06908234659

In [8]:
print ft_pure_formation
scores = cross_val_score(LogisticRegression(), pure_formation[ft_pure_formation], pure_formation['Output'], cv=10, scoring='neg_log_loss')
print "Pure Formation"
print scores.mean()


['country_id', 'league_id', 'season', 'stage', 'date', 'match_api_id', 'home_team_api_id', 'away_team_api_id', 'B365H', 'B365D', 'B365A', 'formation_h', 'formation_a']
Pure Formation
-1.06923334431

In [10]:
# Dados com feature Engineering total
data_min = pd.DataFrame()
data_mean = pd.DataFrame()
for column in ft:
    data_min[column]=(data[column] - data[column].min()) / (data[column].max()-data[column].min())
for column in ft:
    data_mean[column]=(data[column] - data[column].mean()) / data[column].std()

    
scores = cross_val_score(LogisticRegression(), data_min[ft], data['Output'], cv=10, scoring='neg_log_loss')
print "MinMax"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data_mean[ft], data['Output'], cv=10, scoring='neg_log_loss')
print "Mean"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[ft], data['Output'], cv=10, scoring='neg_log_loss')
print "Normal"
print scores.mean()


MinMax
-0.987484303268
Mean
-0.991968851075
Normal
-0.981429559041

In [ ]:
#Classificador mais simple possivel
print data['Output'].value_counts()

print 4042./data.shape[0]


H    4042
A    2528
D    2221
Name: Output, dtype: int64
0.459788419975

In [ ]:
scores = cross_val_score(LogisticRegression(), data_min[ft], data['Output'], cv=10, scoring='log_loss')
print "MinMax"
print scores.mean(),scores.std()
scores = cross_val_score(LogisticRegression(), data_mean[ft], data['Output'], cv=10, scoring='log_loss')
print "Mean"
print scores.mean(),scores.std()
scores = cross_val_score(LogisticRegression(), data[ft], data['Output'], cv=10, scoring='log_loss')
print "Normal"
print scores.mean(),scores.std()


/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
MinMax
-0.987484303268 0.025794254341
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
Mean
-0.991968851075 0.0279115513172
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)
/home/felipef/anaconda2/envs/gl-env/lib/python2.7/site-packages/sklearn/metrics/scorer.py:127: DeprecationWarning: Scoring method log_loss was renamed to neg_log_loss in version 0.18 and will be removed in 0.20.
  sample_weight=sample_weight)

In [10]:
from sklearn.neighbors import KNeighborsClassifier
k_range=range(50,200)
k_score=[]
for k in k_range:
    scores = cross_val_score((KNeighborsClassifier(n_neighbors=k)), data[ft], data['Output'], cv=10, scoring='neg_log_loss')
    k_score.append(scores.mean())

In [11]:
plt.scatter(k_range,k_score)
plt.show()



In [14]:
scores = cross_val_score((KNeighborsClassifier(n_neighbors=100)), data[ft], data['Output'], cv=10, scoring='neg_log_loss')
print scores.mean()


-1.09027134044

In [ ]:


In [16]:
from  sklearn.ensemble import RandomForestClassifier

k_range=[300,500,800,1000,1300,1500,1800,2000,3000,4000,5000]
k_score=[]
for k in k_range:
    scores = cross_val_score((RandomForestClassifier(n_estimators=k, max_depth=3, n_jobs=-1)), data[ft], data['Output'], cv=10, scoring='neg_log_loss')
    k_score.append(scores.mean())

In [ ]:
print len(k_score)

In [ ]:
print len(k_score)
plt.scatter([300,500,800,1000,1300,1500,1800,2000,3000,4000,5000],k_score)
plt.show()

In [ ]:
from  sklearn.ensemble import GradientBoostingClassifier

k_range=[1000,2000,3000,4000]
k_score=[]
for k in k_range:
    scores = cross_val_score((GradientBoostingClassifier(n_estimators=k)), data[ft], data['Output'], cv=5, scoring='accuracy')
    k_score.append(scores.mean())

In [ ]:
len(k_score)
plt.scatter([1000,2000],k_score)
plt.show()

In [ ]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

dt = SelectKBest(chi2, k=48).fit_transform(data[ft],data['Output'])
print dt.shape
print data.shape
print dt[0]

scores = cross_val_score(LogisticRegression(), dt, data['Output'], cv=10, scoring='log_loss')
print scores.mean()

In [ ]:


In [ ]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[ft],data['Output'],test_size=0.33)

lm = LogisticRegression()
lm.fit(X_train,y_train)
prob= lm.predict_proba(X_test)
pre =  lm.predict(X_test)
print prob[0],pre[0]
print pre

print lm.score(X_test,y_test)

In [ ]:
data.columns.values

In [11]:
play_style = ['h_buildUpPlaySpeed',
       'h_buildUpPlaySpeedClass', 'h_buildUpPlayDribblingClass',
       'h_buildUpPlayPassing', 'h_buildUpPlayPassingClass',
       'h_buildUpPlayPositioningClass', 'h_chanceCreationPassing',
       'h_chanceCreationPassingClass', 'h_chanceCreationCrossing',
       'h_chanceCreationCrossingClass', 'h_chanceCreationShooting',
       'h_chanceCreationShootingClass', 'h_chanceCreationPositioningClass',
       'h_defencePressure', 'h_defencePressureClass',
       'h_defenceAggression', 'h_defenceAggressionClass',
       'h_defenceTeamWidth', 'h_defenceTeamWidthClass',
       'h_defenceDefenderLineClass', 'a_buildUpPlaySpeed',
       'a_buildUpPlaySpeedClass', 'a_buildUpPlayDribblingClass',
       'a_buildUpPlayPassing', 'a_buildUpPlayPassingClass',
       'a_buildUpPlayPositioningClass', 'a_chanceCreationPassing',
       'a_chanceCreationPassingClass', 'a_chanceCreationCrossing',
       'a_chanceCreationCrossingClass', 'a_chanceCreationShooting',
       'a_chanceCreationShootingClass', 'a_chanceCreationPositioningClass',
       'a_defencePressure', 'a_defencePressureClass',
       'a_defenceAggression', 'a_defenceAggressionClass',
       'a_defenceTeamWidth', 'a_defenceTeamWidthClass',
       'a_defenceDefenderLineClass']

fifa_ratings=['h_avg_height', 'h_avg_weight',
       'a_avg_height', 'a_avg_weight', 'h_overall', 'h_potential', 'h_def',
       'h_mid', 'h_att', 'a_overall', 'a_potential', 'a_def', 'a_mid',
       'a_att']
base= ['season', 'stage','league_id','a_date', 'h_date','B365H',
       'B365D', 'B365A', 'formation_h', 'formation_a']

goals = ['home_V', 'home_D', 'home_E', 'home_GF', 'home_AVG_GF', 'home_GS',
       'home_AVG_GS', 'home_VG', 'home_DG', 'home_EG', 'home_GFG',
       'home_AVG_GFG', 'home_GSG', 'home_AVG_GSG', 'away_V', 'away_D',
       'away_E', 'away_GF', 'away_AVG_GF', 'away_GS', 'away_AVG_GS',
       'away_VG', 'away_DG', 'away_EG', 'away_GFG', 'away_AVG_GFG',
       'away_GSG']

print goals+base


['home_V', 'home_D', 'home_E', 'home_GF', 'home_AVG_GF', 'home_GS', 'home_AVG_GS', 'home_VG', 'home_DG', 'home_EG', 'home_GFG', 'home_AVG_GFG', 'home_GSG', 'home_AVG_GSG', 'away_V', 'away_D', 'away_E', 'away_GF', 'away_AVG_GF', 'away_GS', 'away_AVG_GS', 'away_VG', 'away_DG', 'away_EG', 'away_GFG', 'away_AVG_GFG', 'away_GSG', 'season', 'stage', 'league_id', 'a_date', 'h_date', 'B365H', 'B365D', 'B365A', 'formation_h', 'formation_a']

In [ ]:
scores = cross_val_score(LogisticRegression(), data[fifa_ratings+goals+base], data['Output'], cv=10, scoring='log_loss')
print scores.mean()

In [12]:
from  sklearn.ensemble import RandomForestClassifier
scores = cross_val_score((RandomForestClassifier(n_estimators=1000, max_depth=3)), data[ft], data['Output'], cv=10, scoring='neg_log_loss')
print scores.mean()


-0.984984243937

In [ ]:
k_range=[1000,2000,3000,4000,5000]
k_score=[]
for k in k_range:
    scores = cross_val_score((RandomForestClassifier(n_estimators=k, max_depth=3)), data[fifa_ratings+goals+base], data['Output'], cv=10, scoring='accuracy')
    k_score.append(scores.mean())

In [ ]:
print k_score

In [15]:
scores = cross_val_score(LogisticRegression(), data[base], data['Output'], cv=10, scoring='neg_log_loss')
print "Base"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[fifa_ratings], data['Output'], cv=10, scoring='neg_log_loss')
print "Ratings"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[goals], data['Output'], cv=10, scoring='neg_log_loss')
print "Goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Style"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[base+goals], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[base+fifa_ratings], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+Ratings"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[base+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+Style"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[fifa_ratings+goals], data['Output'], cv=10, scoring='neg_log_loss')
print "Rating+Goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Goals+Style"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[fifa_ratings+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Ratings+Style"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[base+fifa_ratings+goals], data['Output'], cv=10, scoring='neg_log_loss')
print "base+Rating+Goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[base+fifa_ratings+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "base+Rating+play_style"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[base+goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "base+goals+play_style"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[fifa_ratings+goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Rating+goals+play_style"
print scores.mean()


Base
-0.973562814441
Ratings
-0.984979257755
Goals
-1.03314259493
Style
-1.06146631889
Base+goals
-0.974094074696
Base+Ratings
-0.97333685394
Base+Style
-0.979329914618
Rating+Goals
-0.986844484151
Goals+Style
-1.03907113122
Ratings+Style
-0.998159144845
base+Rating+Goals
-0.974709147966
base+Rating+play_style
-0.980211581839
base+goals+play_style
-0.981671730394
Rating+goals+play_style
-1.0006185998

In [17]:
scores = cross_val_score(RandomForestClassifier(n_estimators=1000), data[play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Style"
print scores.mean()
scores = cross_val_score(RandomForestClassifier(n_estimators=1000,n_jobs=4), data[base+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+Style"
print scores.mean()
scores = cross_val_score(RandomForestClassifier(n_estimators=1000,n_jobs=4), data[goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Goals+Style"
print scores.mean()
scores = cross_val_score(RandomForestClassifier(n_estimators=1000,n_jobs=4), data[fifa_ratings+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Ratings+Style"
print scores.mean()
scores = cross_val_score(RandomForestClassifier(n_estimators=1000,n_jobs=4), data[fifa_ratings+goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Rating+goals+play_style"
print scores.mean()


Style
-1.08837422037
Base+Style
-1.00342865589
Goals+Style
-1.04459693463
Ratings+Style
-1.00359251103
Rating+goals+play_style
-0.996481363352

In [ ]:


In [16]:
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base], data['Output'], cv=10, scoring='neg_log_loss')
print "Base"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[fifa_ratings], data['Output'], cv=10, scoring='neg_log_loss')
print "Ratings"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[goals], data['Output'], cv=10, scoring='neg_log_loss')
print "Goals"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Style"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base+goals], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+goals"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base+fifa_ratings], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+Ratings"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Base+Style"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[fifa_ratings+goals], data['Output'], cv=10, scoring='neg_log_loss')
print "Rating+Goals"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Goals+Style"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[fifa_ratings+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Ratings+Style"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base+fifa_ratings+goals], data['Output'], cv=10, scoring='neg_log_loss')
print "base+Rating+Goals"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base+fifa_ratings+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "base+Rating+play_style"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[base+goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "base+goals+play_style"
print scores.mean()
scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[fifa_ratings+goals+play_style], data['Output'], cv=10, scoring='neg_log_loss')
print "Rating+goals+play_style"
print scores.mean()


Base
-1.03659004831
Ratings
-0.994091173579
Goals
-1.03785982671
Style
-1.06445276755
Base+goals
-1.04955993303
Base+Ratings
-1.00901515738
Base+Style
-1.07038982999
Rating+Goals
-0.992641108592
Goals+Style
-1.04071347198
Ratings+Style
-1.02397472781
base+Rating+Goals
-1.02632818715
base+Rating+play_style
-1.04787193518
base+goals+play_style
-1.0604808121
Rating+goals+play_style
-1.01855727069

In [ ]:
print uk['Output'].value_counts()[0]/(uk.shape[0]*1.)
print de['Output'].value_counts()[0]/(de.shape[0]*1.)
print fr['Output'].value_counts()[0]/(fr.shape[0]*1.)
print it['Output'].value_counts()[0]/(it.shape[0]*1.)
print pt['Output'].value_counts()[0]/(pt.shape[0]*1.)
print es['Output'].value_counts()[0]/(es.shape[0]*1.)

In [ ]:
scores = cross_val_score(LogisticRegression(), uk[base], uk['Output'], cv=10, scoring='log_loss')
print "Base"
print scores.mean()

In [ ]:
mandante_forte = data[data['h_overall']>= (data['a_overall']+3)] 
visitante_forte = data[data['a_overall']>= (data['h_overall']+3)] 
iguais = data[abs(data['h_overall'] -(data['a_overall'])) < 3 ] 

print mandante_forte['Output'].value_counts()[0]/(mandante_forte.shape[0]*1.)
print visitante_forte['Output'].value_counts()[0]/(visitante_forte.shape[0]*1.)
print iguais['Output'].value_counts()[0]/(iguais.shape[0]*1.)

In [ ]:
print mandante_forte.shape[0]+visitante_forte.shape[0]+ iguais.shape[0]

print uk.shape[0]+de.shape[0]+es.shape[0]+it.shape[0]+fr.shape[0]+pt.shape[0]

In [18]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[ft],data['Output'], test_size=0.33)

treino= pd.concat([X_train,y_train],axis=1)
mandante_forte = treino[treino['h_overall']>= (treino['a_overall']+3)] 
visitante_forte = treino[treino['a_overall']>= (treino['h_overall']+3)] 
iguais = treino[abs(treino['h_overall'] -(treino['a_overall'])) < 3 ]

print mandante_forte['Output'].value_counts()[0]/(mandante_forte.shape[0]*1.)
print visitante_forte['Output'].value_counts()[0]/(visitante_forte.shape[0]*1.)
print iguais['Output'].value_counts()[0]/(iguais.shape[0]*1.)


0.66847826087
0.499057196732
0.460635881908

In [19]:
print "Mandante Forte"

scores = cross_val_score(LogisticRegression(), mandante_forte[base], mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base"
print scores.mean()
scores = cross_val_score(LogisticRegression(), mandante_forte[base+goals], mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base-Goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), mandante_forte[base+fifa_ratings], mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base+ratings"
print scores.mean()
scores = cross_val_score(LogisticRegression(), mandante_forte[base+goals+fifa_ratings], mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base+goals+ratings"
print scores.mean()

scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),mandante_forte[base] , mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),mandante_forte[base+goals] , mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base-Goals"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),mandante_forte[base+fifa_ratings] , mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base+ratings"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),mandante_forte[base+goals+fifa_ratings] , mandante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base+goals+ratings"
print scores.mean()


Mandante Forte
Regressão Logistica - Base
-0.814989731478
Regressão Logistica - Base-Goals
-0.816478929558
Regressão Logistica - Base+ratings
-0.820619992537
Regressão Logistica - Base+goals+ratings
-0.823411274004
Random - Base
-0.852166724971
Random - Base-Goals
-0.837783283072
Random - Base+ratings
-0.841615729019
Random - Base+goals+ratings
-0.834640409641

In [43]:
print len(k_score)

plt.scatter([100,500,1000,2000],k_score)
plt.show()


4

In [20]:
print "visitante Forte"

scores = cross_val_score(LogisticRegression(), visitante_forte[base], visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base"
print scores.mean()
scores = cross_val_score(LogisticRegression(), visitante_forte[base+goals], visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base-Goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), visitante_forte[base+fifa_ratings], visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base+ratings"
print scores.mean()
scores = cross_val_score(LogisticRegression(), visitante_forte[base+goals+fifa_ratings], visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base+goals+ratings"
print scores.mean()

scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),visitante_forte[base] , visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),visitante_forte[base+goals] , visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base-Goals"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),visitante_forte[base+fifa_ratings] , visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base+ratings"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),visitante_forte[base+goals+fifa_ratings] , visitante_forte['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base+goals+ratings"
print scores.mean()


visitante Forte
Regressão Logistica - Base
-1.0036004086
Regressão Logistica - Base-Goals
-1.01224050007
Regressão Logistica - Base+ratings
-1.01356365601
Regressão Logistica - Base+goals+ratings
-1.02250358003
Random - Base
-1.07802630191
Random - Base-Goals
-1.03579179506
Random - Base+ratings
-1.041927169
Random - Base+goals+ratings
-1.02999456242

In [22]:
print "iguais Forte"

scores = cross_val_score(LogisticRegression(), iguais[base], iguais['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base"
print scores.mean()
scores = cross_val_score(LogisticRegression(), iguais[base+goals], iguais['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base-Goals"
print scores.mean()
scores = cross_val_score(LogisticRegression(), iguais[base+fifa_ratings], iguais['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base+ratings"
print scores.mean()
scores = cross_val_score(LogisticRegression(), iguais[base+goals+fifa_ratings], iguais['Output'], cv=10, scoring='neg_log_loss')
print "Regressão Logistica - Base+goals+ratings"
print scores.mean()

scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),iguais[base] , iguais['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),iguais[base+goals] , iguais['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base-Goals"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),iguais[base+fifa_ratings] , iguais['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base+ratings"
print scores.mean()
scores = cross_val_score((RandomForestClassifier(n_estimators=1000,n_jobs=4)),iguais[base+goals+fifa_ratings] , iguais['Output'], cv=10, scoring='neg_log_loss')
print "Random - Base+goals+ratings"
print scores.mean()


iguais Forte
Regressão Logistica - Base
-1.05508327122
Regressão Logistica - Base-Goals
-1.05845467839
Regressão Logistica - Base+ratings
-1.05967056878
Regressão Logistica - Base+goals+ratings
-1.06414120982
Random - Base
-1.10259356201
Random - Base-Goals
-1.07685361898
Random - Base+ratings
-1.07201231689
Random - Base+goals+ratings
-1.06899597774

In [23]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data[ft],data['Output'], test_size=0.33)

treino= pd.concat([X_train,y_train],axis=1)
mandante_forte = treino[treino['h_overall']>= (treino['a_overall']+3)] 
visitante_forte = treino[treino['a_overall']>= (treino['h_overall']+3)] 
iguais = treino[abs(treino['h_overall'] -(treino['a_overall'])) < 3 ]

print mandante_forte['Output'].value_counts()[0]/(mandante_forte.shape[0]*1.)
print visitante_forte['Output'].value_counts()[0]/(visitante_forte.shape[0]*1.)
print iguais['Output'].value_counts()[0]/(iguais.shape[0]*1.)


0.666666666667
0.509194673431
0.452480121166

In [24]:
reg_H = LogisticRegression()
reg_H.fit(mandante_forte[base],mandante_forte['Output'])
reg_A = LogisticRegression()
reg_A.fit(visitante_forte[base],visitante_forte['Output'])
reg_I = LogisticRegression()
reg_I.fit(iguais[base],iguais['Output'])


Out[24]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [25]:
teste= pd.concat([X_train,y_train],axis=1)
m = treino[teste['h_overall']>= (teste['a_overall']+3)] 
v = treino[teste['a_overall']>= (teste['h_overall']+3)] 
i = treino[abs(teste['h_overall'] -(teste['a_overall'])) < 3 ]

In [32]:
print reg_H.predict_log_proba(m[base]).mean()
print reg_A.predict_log_proba(v[base]).mean()
print reg_I.predict_log_proba(i[base]).mean()
print (-1.49989713783 + -1.21385058598  + -1.14233720246)/3


-1.49989713783
-1.21385058598
-1.14233720246
-1.28536164209

In [35]:
scores = cross_val_score(LogisticRegression(), data[base], data['Output'], cv=10, scoring='accuracy')
print "Regressão Logistica - Base"
print scores.mean()
scores = cross_val_score(LogisticRegression(), data[ft], data['Output'], cv=10, scoring='accuracy')
print "Regressão Logistica - Base"
print scores.mean()


Regressão Logistica - Base
0.532708042354
Regressão Logistica - Base
0.522581341142