In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sqlite3
import numpy as np
from numpy import random
# Path of the SQLite database file, relative to the notebook's working directory.
database = 'database.sqlite'
# Open a connection to that file.
# NOTE(review): `conn` is not used by any cell visible here and is never closed —
# confirm it is still needed.
conn = sqlite3.connect(database)
In [2]:
# Load one feature table per league; the first (unnamed) CSV column is the saved index.
# NOTE(review): the file names look like league ids — confirm which id maps to which league.
uk = pd.read_csv('./leagues/1729.csv', index_col='Unnamed: 0')
de = pd.read_csv('./leagues/7809.csv', index_col='Unnamed: 0')
fr = pd.read_csv('./leagues/4769.csv', index_col='Unnamed: 0')
it = pd.read_csv('./leagues/10257.csv', index_col='Unnamed: 0')
es = pd.read_csv('./leagues/21518.csv', index_col='Unnamed: 0')
pt = pd.read_csv('./leagues/17642.csv', index_col='Unnamed: 0')

# Concatenate once per result instead of chaining pairwise concats, which copied the
# already-merged rows again at every step. Row order is unchanged:
#   data1 -> es, fr, pt
#   data  -> uk, de, it, es, fr, pt
data1 = pd.concat([es, fr, pt])
data = pd.concat([uk, de, it, data1])
In [3]:
# Full list of feature columns passed to the models (everything selected via data[ft]).
ft = [
    # home_* columns
    'home_V', 'home_D', 'home_E', 'home_GF', 'home_AVG_GF', 'home_GS', 'home_AVG_GS',
    'home_VG', 'home_DG', 'home_EG', 'home_GFG', 'home_AVG_GFG', 'home_GSG', 'home_AVG_GSG',
    # away_* columns
    'away_V', 'away_D', 'away_E', 'away_GF', 'away_AVG_GF', 'away_GS', 'away_AVG_GS',
    'away_VG', 'away_DG', 'away_EG', 'away_GFG', 'away_AVG_GFG', 'away_GSG', 'away_AVG_GSG',
    # h_* buildUp / chanceCreation / defence columns
    'h_buildUpPlaySpeed', 'h_buildUpPlaySpeedClass', 'h_buildUpPlayDribblingClass',
    'h_buildUpPlayPassing', 'h_buildUpPlayPassingClass', 'h_buildUpPlayPositioningClass',
    'h_chanceCreationPassing', 'h_chanceCreationPassingClass',
    'h_chanceCreationCrossing', 'h_chanceCreationCrossingClass',
    'h_chanceCreationShooting', 'h_chanceCreationShootingClass',
    'h_chanceCreationPositioningClass',
    'h_defencePressure', 'h_defencePressureClass',
    'h_defenceAggression', 'h_defenceAggressionClass',
    'h_defenceTeamWidth', 'h_defenceTeamWidthClass', 'h_defenceDefenderLineClass',
    # a_* buildUp / chanceCreation / defence columns
    'a_buildUpPlaySpeed', 'a_buildUpPlaySpeedClass', 'a_buildUpPlayDribblingClass',
    'a_buildUpPlayPassing', 'a_buildUpPlayPassingClass', 'a_buildUpPlayPositioningClass',
    'a_chanceCreationPassing', 'a_chanceCreationPassingClass',
    'a_chanceCreationCrossing', 'a_chanceCreationCrossingClass',
    'a_chanceCreationShooting', 'a_chanceCreationShootingClass',
    'a_chanceCreationPositioningClass',
    'a_defencePressure', 'a_defencePressureClass',
    'a_defenceAggression', 'a_defenceAggressionClass',
    'a_defenceTeamWidth', 'a_defenceTeamWidthClass', 'a_defenceDefenderLineClass',
    # height / weight averages
    'h_avg_height', 'h_avg_weight', 'a_avg_height', 'a_avg_weight',
    # h_* / a_* rating columns
    'h_overall', 'h_potential', 'h_def', 'h_mid', 'h_att',
    'a_overall', 'a_potential', 'a_def', 'a_mid', 'a_att',
    # dates, identifiers, odds, formations, league
    'a_date', 'h_date', 'id', 'season', 'stage',
    'home_team_api_id', 'away_team_api_id',
    'B365H', 'B365D', 'B365A',
    'formation_h', 'formation_a', 'league_id',
]
# Note: in-place z-score standardisation of data[ft] was tried here and left disabled;
# a later cell builds the standardised copy (data_mean) without touching `data`.
In [4]:
# Show the concatenated frame as the cell's output.
data
Out[4]:
In [ ]:
In [5]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.neighbors import KNeighborsClassifier
# sklearn.cross_validation is deprecated in favour of sklearn.model_selection; prefer the
# new module and fall back only on installations that do not have it.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

# Label encoder instance kept at module level; a later cell uses it on the formation columns.
le = preprocessing.LabelEncoder()

# 10-fold cross-validated logistic regression on all `ft` columns.
# 'neg_log_loss' is the current name of the scorer (the old 'log_loss' alias is deprecated)
# and matches the scoring used by the other cells of this notebook.
scores = cross_val_score(LogisticRegression(), data[ft], data['Output'], cv=10, scoring='neg_log_loss')
print(scores.mean())
In [9]:
# Load the two "pure" tables; the first (unnamed) CSV column is the saved index.
pure_data = pd.read_csv('pure_data.csv', index_col='Unnamed: 0')
pure_formation = pd.read_csv('pure_formation.csv', index_col='Unnamed: 0')

# Feature columns of pure_data: every column except the last one.
ft_pure_data = pure_data.columns.values[:-1]

# Feature columns used from pure_formation.
ft_pure_formation = [
    'country_id', 'league_id', 'season', 'stage', 'date',
    'match_api_id', 'home_team_api_id', 'away_team_api_id',
    'B365H', 'B365D', 'B365A',
    'formation_h', 'formation_a',
]

# Replace each formation column by integer codes (values are cast to str first).
# The shared encoder `le` is re-fitted per column, away first and then home, so it is
# left fitted on 'formation_h'.
for formation_col in ('formation_a', 'formation_h'):
    pure_formation[formation_col] = le.fit_transform(pure_formation[formation_col].astype('str'))
In [7]:
# Print the (rows, columns) shape of each "pure" table.
for pure_table in (pure_data, pure_formation):
    print(pure_table.shape)
In [8]:
# 10-fold cross-validated logistic regression on the pure_data features.
scores = cross_val_score(
    LogisticRegression(),
    pure_data[ft_pure_data],
    pure_data['Output'],
    cv=10,
    scoring='neg_log_loss',
)
print("Pure Data")
print(scores.mean())
In [8]:
# Show the feature list, then score a 10-fold cross-validated logistic regression on it.
print(ft_pure_formation)
scores = cross_val_score(
    LogisticRegression(),
    pure_formation[ft_pure_formation],
    pure_formation['Output'],
    cv=10,
    scoring='neg_log_loss',
)
print("Pure Formation")
print(scores.mean())
In [10]:
# Data with full feature engineering: compare two scalings of the `ft` columns
# against the unscaled columns.
data_min = pd.DataFrame()
data_mean = pd.DataFrame()
for column in ft:
    col_values = data[column]
    # Min-max scaling: (x - min) / (max - min).
    data_min[column] = (col_values - col_values.min()) / (col_values.max() - col_values.min())
    # Standardisation: (x - mean) / std.
    data_mean[column] = (col_values - col_values.mean()) / col_values.std()

# Same model and CV setup for each variant; only the feature frame changes.
for label, features in (("MinMax", data_min[ft]), ("Mean", data_mean[ft]), ("Normal", data[ft])):
    scores = cross_val_score(LogisticRegression(), features, data['Output'], cv=10, scoring='neg_log_loss')
    print(label)
    print(scores.mean())
In [ ]:
# Simplest possible classifier: always predict the most frequent outcome.
class_counts = data['Output'].value_counts()
print(class_counts)
# Share of the most frequent class, derived from the counts instead of a number
# pasted from an earlier run (which silently goes stale when `data` changes).
print(class_counts.max() / float(data.shape[0]))
In [ ]:
# Mean and standard deviation of the 10-fold scores for each feature scaling.
# Uses 'neg_log_loss', the current scorer name; 'log_loss' is a deprecated alias and
# the other cells of this notebook already use the new name.
for label, features in (("MinMax", data_min[ft]), ("Mean", data_mean[ft]), ("Normal", data[ft])):
    scores = cross_val_score(LogisticRegression(), features, data['Output'], cv=10, scoring='neg_log_loss')
    print(label)
    # Single formatted string so the two numbers print space-separated, as before.
    print("%s %s" % (scores.mean(), scores.std()))
In [10]:
from sklearn.neighbors import KNeighborsClassifier
# Sweep n_neighbors over 50..199 and keep the mean 10-fold neg_log_loss for each value.
k_range = range(50, 200)
k_score = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, data[ft], data['Output'], cv=10, scoring='neg_log_loss')
    k_score.append(scores.mean())
In [11]:
# Scatter the swept parameter values against their mean CV scores.
plt.scatter(x=k_range, y=k_score)
plt.show()
In [14]:
# 10-fold neg_log_loss of a 100-neighbour KNN on all `ft` columns.
knn_100 = KNeighborsClassifier(n_neighbors=100)
scores = cross_val_score(knn_100, data[ft], data['Output'], cv=10, scoring='neg_log_loss')
print(scores.mean())
In [ ]:
In [16]:
from sklearn.ensemble import RandomForestClassifier
# Sweep the number of trees of a depth-3 random forest; keep the mean 10-fold
# neg_log_loss for each setting.
k_range = [300, 500, 800, 1000, 1300, 1500, 1800, 2000, 3000, 4000, 5000]
k_score = []
for k in k_range:
    forest = RandomForestClassifier(n_estimators=k, max_depth=3, n_jobs=-1)
    scores = cross_val_score(forest, data[ft], data['Output'], cv=10, scoring='neg_log_loss')
    k_score.append(scores.mean())
In [ ]:
# Number of scores collected by the last sweep.
print(len(k_score))
In [ ]:
# Plot the sweep results. The x values come from `k_range`, the list the sweep
# iterated over, rather than from a second hand-copied literal that could drift from it.
print(len(k_score))
plt.scatter(k_range, k_score)
plt.show()
In [ ]:
from sklearn.ensemble import GradientBoostingClassifier
# Sweep the number of boosting stages; keep the mean 5-fold accuracy for each setting.
k_range = [1000, 2000, 3000, 4000]
k_score = []
for k in k_range:
    booster = GradientBoostingClassifier(n_estimators=k)
    scores = cross_val_score(booster, data[ft], data['Output'], cv=5, scoring='accuracy')
    k_score.append(scores.mean())
In [ ]:
# Plot the sweep results. The original passed only two x values ([1000, 2000]) against
# the four scores produced by the sweep, which makes plt.scatter fail with a size
# mismatch; `k_range` always has one entry per score.
print(len(k_score))  # was a bare expression in mid-cell, which displayed nothing
plt.scatter(k_range, k_score)
plt.show()
In [ ]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Keep the 48 columns with the highest chi-squared score against the target.
# NOTE(review): chi2 requires non-negative feature values — confirm this holds for every `ft` column.
dt = SelectKBest(chi2, k=48).fit_transform(data[ft], data['Output'])
print(dt.shape)
print(data.shape)
print(dt[0])
# 'neg_log_loss' is the current scorer name ('log_loss' is a deprecated alias) and
# matches the scoring used by the other cells of this notebook.
scores = cross_val_score(LogisticRegression(), dt, data['Output'], cv=10, scoring='neg_log_loss')
print(scores.mean())
In [ ]:
In [ ]:
# sklearn.cross_validation is deprecated in favour of sklearn.model_selection; prefer the
# new module and fall back only on installations that do not have it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 33% of the rows. The split is unseeded, so it differs on every run.
X_train, X_test, y_train, y_test = train_test_split(data[ft], data['Output'], test_size=0.33)
lm = LogisticRegression()
lm.fit(X_train, y_train)
prob = lm.predict_proba(X_test)
pre = lm.predict(X_test)
# Class probabilities and predicted label of the first held-out row, on one line.
print("%s %s" % (prob[0], pre[0]))
print(pre)
# Mean accuracy on the held-out rows.
print(lm.score(X_test, y_test))
In [ ]:
# Show all column names of the combined frame as the cell's output.
data.columns.values
In [11]:
# Named subsets of the feature columns, used to compare models on different inputs.

# h_* / a_* buildUp, chanceCreation and defence columns.
play_style = [
    'h_buildUpPlaySpeed',
    'h_buildUpPlaySpeedClass', 'h_buildUpPlayDribblingClass',
    'h_buildUpPlayPassing', 'h_buildUpPlayPassingClass',
    'h_buildUpPlayPositioningClass', 'h_chanceCreationPassing',
    'h_chanceCreationPassingClass', 'h_chanceCreationCrossing',
    'h_chanceCreationCrossingClass', 'h_chanceCreationShooting',
    'h_chanceCreationShootingClass', 'h_chanceCreationPositioningClass',
    'h_defencePressure', 'h_defencePressureClass',
    'h_defenceAggression', 'h_defenceAggressionClass',
    'h_defenceTeamWidth', 'h_defenceTeamWidthClass',
    'h_defenceDefenderLineClass', 'a_buildUpPlaySpeed',
    'a_buildUpPlaySpeedClass', 'a_buildUpPlayDribblingClass',
    'a_buildUpPlayPassing', 'a_buildUpPlayPassingClass',
    'a_buildUpPlayPositioningClass', 'a_chanceCreationPassing',
    'a_chanceCreationPassingClass', 'a_chanceCreationCrossing',
    'a_chanceCreationCrossingClass', 'a_chanceCreationShooting',
    'a_chanceCreationShootingClass', 'a_chanceCreationPositioningClass',
    'a_defencePressure', 'a_defencePressureClass',
    'a_defenceAggression', 'a_defenceAggressionClass',
    'a_defenceTeamWidth', 'a_defenceTeamWidthClass',
    'a_defenceDefenderLineClass',
]

# Height/weight averages and rating columns for both sides.
fifa_ratings = [
    'h_avg_height', 'h_avg_weight',
    'a_avg_height', 'a_avg_weight', 'h_overall', 'h_potential', 'h_def',
    'h_mid', 'h_att', 'a_overall', 'a_potential', 'a_def', 'a_mid',
    'a_att',
]

# Season/stage/league, dates, B365 odds and formations.
base = [
    'season', 'stage', 'league_id', 'a_date', 'h_date', 'B365H',
    'B365D', 'B365A', 'formation_h', 'formation_a',
]

# home_* / away_* columns. 'away_AVG_GSG' was missing from the original list although
# its home counterpart 'home_AVG_GSG' was included and the column is part of `ft`;
# it is added so both sides carry the same 14 statistics.
goals = [
    'home_V', 'home_D', 'home_E', 'home_GF', 'home_AVG_GF', 'home_GS',
    'home_AVG_GS', 'home_VG', 'home_DG', 'home_EG', 'home_GFG',
    'home_AVG_GFG', 'home_GSG', 'home_AVG_GSG', 'away_V', 'away_D',
    'away_E', 'away_GF', 'away_AVG_GF', 'away_GS', 'away_AVG_GS',
    'away_VG', 'away_DG', 'away_EG', 'away_GFG', 'away_AVG_GFG',
    'away_GSG', 'away_AVG_GSG',
]
print(goals + base)
In [ ]:
# 10-fold cross-validated logistic regression on ratings + goals + base columns.
# 'neg_log_loss' is the current scorer name ('log_loss' is a deprecated alias) and
# matches the scoring used by the other cells of this notebook.
scores = cross_val_score(LogisticRegression(), data[fifa_ratings + goals + base], data['Output'], cv=10, scoring='neg_log_loss')
print(scores.mean())
In [12]:
from sklearn.ensemble import RandomForestClassifier
# 10-fold neg_log_loss of a 1000-tree, depth-3 random forest on all `ft` columns.
forest_1000 = RandomForestClassifier(n_estimators=1000, max_depth=3)
scores = cross_val_score(forest_1000, data[ft], data['Output'], cv=10, scoring='neg_log_loss')
print(scores.mean())
In [ ]:
# Sweep the number of trees of a depth-3 random forest on ratings + goals + base;
# keep the mean 10-fold accuracy for each setting.
k_range = [1000, 2000, 3000, 4000, 5000]
k_score = []
for k in k_range:
    forest = RandomForestClassifier(n_estimators=k, max_depth=3)
    scores = cross_val_score(forest, data[fifa_ratings + goals + base], data['Output'], cv=10, scoring='accuracy')
    k_score.append(scores.mean())
In [ ]:
# Mean CV scores collected by the last sweep.
print(k_score)
In [15]:
# Logistic regression, 10-fold neg_log_loss, on every combination of the feature groups.
# Each entry is (printed label, column list); order and labels follow the original cell.
feature_sets = [
    ("Base", base),
    ("Ratings", fifa_ratings),
    ("Goals", goals),
    ("Style", play_style),
    ("Base+goals", base + goals),
    ("Base+Ratings", base + fifa_ratings),
    ("Base+Style", base + play_style),
    ("Rating+Goals", fifa_ratings + goals),
    ("Goals+Style", goals + play_style),
    ("Ratings+Style", fifa_ratings + play_style),
    ("base+Rating+Goals", base + fifa_ratings + goals),
    ("base+Rating+play_style", base + fifa_ratings + play_style),
    ("base+goals+play_style", base + goals + play_style),
    ("Rating+goals+play_style", fifa_ratings + goals + play_style),
]
for label, columns in feature_sets:
    scores = cross_val_score(LogisticRegression(), data[columns], data['Output'], cv=10, scoring='neg_log_loss')
    print(label)
    print(scores.mean())
In [17]:
# 1000-tree random forest, 10-fold neg_log_loss, on the feature sets that include play_style.
# Each entry is (printed label, estimator, column list). The first run keeps the default
# n_jobs, the others use n_jobs=4, exactly as in the original cell.
rf_runs = [
    ("Style", RandomForestClassifier(n_estimators=1000), play_style),
    ("Base+Style", RandomForestClassifier(n_estimators=1000, n_jobs=4), base + play_style),
    ("Goals+Style", RandomForestClassifier(n_estimators=1000, n_jobs=4), goals + play_style),
    ("Ratings+Style", RandomForestClassifier(n_estimators=1000, n_jobs=4), fifa_ratings + play_style),
    ("Rating+goals+play_style", RandomForestClassifier(n_estimators=1000, n_jobs=4), fifa_ratings + goals + play_style),
]
for label, forest, columns in rf_runs:
    scores = cross_val_score(forest, data[columns], data['Output'], cv=10, scoring='neg_log_loss')
    print(label)
    print(scores.mean())
In [ ]:
In [16]:
# 100-neighbour KNN, 10-fold neg_log_loss, on every combination of the feature groups.
# Each entry is (printed label, column list); order and labels follow the original cell.
feature_sets = [
    ("Base", base),
    ("Ratings", fifa_ratings),
    ("Goals", goals),
    ("Style", play_style),
    ("Base+goals", base + goals),
    ("Base+Ratings", base + fifa_ratings),
    ("Base+Style", base + play_style),
    ("Rating+Goals", fifa_ratings + goals),
    ("Goals+Style", goals + play_style),
    ("Ratings+Style", fifa_ratings + play_style),
    ("base+Rating+Goals", base + fifa_ratings + goals),
    ("base+Rating+play_style", base + fifa_ratings + play_style),
    ("base+goals+play_style", base + goals + play_style),
    ("Rating+goals+play_style", fifa_ratings + goals + play_style),
]
for label, columns in feature_sets:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=100), data[columns], data['Output'], cv=10, scoring='neg_log_loss')
    print(label)
    print(scores.mean())
In [ ]:
# For each league frame, print the share of rows counted under key 0 of
# Output.value_counts(). Order: uk, de, fr, it, pt, es.
# NOTE(review): [0] is a label lookup when Output holds integers — confirm which outcome 0 encodes.
for league in (uk, de, fr, it, pt, es):
    print(league['Output'].value_counts()[0] / (league.shape[0] * 1.))
In [ ]:
# 10-fold cross-validated logistic regression on the `base` columns of the uk frame only.
# 'neg_log_loss' is the current scorer name ('log_loss' is a deprecated alias) and
# matches the scoring used by the other cells of this notebook.
scores = cross_val_score(LogisticRegression(), uk[base], uk['Output'], cv=10, scoring='neg_log_loss')
print("Base")
print(scores.mean())
In [ ]:
# Split all matches into three segments by the gap between the overall ratings:
#   mandante_forte  - home rating at least 3 above the away rating
#   visitante_forte - away rating at least 3 above the home rating
#   iguais          - absolute gap below 3
home_rating = data['h_overall']
away_rating = data['a_overall']
mandante_forte = data[home_rating >= (away_rating + 3)]
visitante_forte = data[away_rating >= (home_rating + 3)]
iguais = data[(home_rating - away_rating).abs() < 3]

# Share of rows counted under key 0 of Output.value_counts(), per segment.
for segment in (mandante_forte, visitante_forte, iguais):
    print(segment['Output'].value_counts()[0] / (segment.shape[0] * 1.))
In [ ]:
# Row-count check: total rows across the three rating segments, then total rows
# across the six league frames.
print(sum(segment.shape[0] for segment in (mandante_forte, visitante_forte, iguais)))
print(sum(league.shape[0] for league in (uk, de, es, it, fr, pt)))
In [18]:
# sklearn.cross_validation is deprecated in favour of sklearn.model_selection; prefer the
# new module and fall back only on installations that do not have it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 33% of the rows. The split is unseeded, so it differs on every run.
X_train, X_test, y_train, y_test = train_test_split(data[ft], data['Output'], test_size=0.33)

# Training rows with their target, split into the three rating-gap segments
# (home stronger by >= 3, away stronger by >= 3, absolute gap < 3).
treino = pd.concat([X_train, y_train], axis=1)
mandante_forte = treino[treino['h_overall'] >= (treino['a_overall'] + 3)]
visitante_forte = treino[treino['a_overall'] >= (treino['h_overall'] + 3)]
iguais = treino[abs(treino['h_overall'] - (treino['a_overall'])) < 3]

# Share of rows counted under key 0 of Output.value_counts(), per segment.
for segment in (mandante_forte, visitante_forte, iguais):
    print(segment['Output'].value_counts()[0] / (segment.shape[0] * 1.))
In [19]:
# Home-stronger segment: 10-fold neg_log_loss for logistic regression and for a
# 1000-tree random forest on four feature sets. Entries are (printed label, column list).
print("Mandante Forte")
logreg_runs = [
    ("Regressão Logistica - Base", base),
    ("Regressão Logistica - Base-Goals", base + goals),
    ("Regressão Logistica - Base+ratings", base + fifa_ratings),
    ("Regressão Logistica - Base+goals+ratings", base + goals + fifa_ratings),
]
forest_runs = [
    ("Random - Base", base),
    ("Random - Base-Goals", base + goals),
    ("Random - Base+ratings", base + fifa_ratings),
    ("Random - Base+goals+ratings", base + goals + fifa_ratings),
]
for label, columns in logreg_runs:
    scores = cross_val_score(LogisticRegression(), mandante_forte[columns], mandante_forte['Output'], cv=10, scoring='neg_log_loss')
    print(label)
    print(scores.mean())
for label, columns in forest_runs:
    scores = cross_val_score(RandomForestClassifier(n_estimators=1000, n_jobs=4), mandante_forte[columns], mandante_forte['Output'], cv=10, scoring='neg_log_loss')
    print(label)
    print(scores.mean())
In [43]:
# Plot the most recent sweep. The original x values ([100, 500, 1000, 2000]) match none
# of the sweeps defined in this notebook, so they disagree in length with `k_score`
# when the cells run in order; `k_range` is the list the sweep iterated over.
print(len(k_score))
plt.scatter(k_range, k_score)
plt.show()
In [20]:
# Away-stronger segment: 10-fold neg_log_loss for logistic regression and for a
# 1000-tree random forest on four feature sets. Entries are (printed label, column list).
print("visitante Forte")
logreg_runs = [
    ("Regressão Logistica - Base", base),
    ("Regressão Logistica - Base-Goals", base + goals),
    ("Regressão Logistica - Base+ratings", base + fifa_ratings),
    ("Regressão Logistica - Base+goals+ratings", base + goals + fifa_ratings),
]
forest_runs = [
    ("Random - Base", base),
    ("Random - Base-Goals", base + goals),
    ("Random - Base+ratings", base + fifa_ratings),
    ("Random - Base+goals+ratings", base + goals + fifa_ratings),
]
for label, columns in logreg_runs:
    scores = cross_val_score(LogisticRegression(), visitante_forte[columns], visitante_forte['Output'], cv=10, scoring='neg_log_loss')
    print(label)
    print(scores.mean())
for label, columns in forest_runs:
    scores = cross_val_score(RandomForestClassifier(n_estimators=1000, n_jobs=4), visitante_forte[columns], visitante_forte['Output'], cv=10, scoring='neg_log_loss')
    print(label)
    print(scores.mean())
In [22]:
# Evenly-rated segment: 10-fold neg_log_loss for logistic regression and for a
# 1000-tree random forest on four feature sets. Entries are (printed label, column list).
print("iguais Forte")
logreg_runs = [
    ("Regressão Logistica - Base", base),
    ("Regressão Logistica - Base-Goals", base + goals),
    ("Regressão Logistica - Base+ratings", base + fifa_ratings),
    ("Regressão Logistica - Base+goals+ratings", base + goals + fifa_ratings),
]
forest_runs = [
    ("Random - Base", base),
    ("Random - Base-Goals", base + goals),
    ("Random - Base+ratings", base + fifa_ratings),
    ("Random - Base+goals+ratings", base + goals + fifa_ratings),
]
for label, columns in logreg_runs:
    scores = cross_val_score(LogisticRegression(), iguais[columns], iguais['Output'], cv=10, scoring='neg_log_loss')
    print(label)
    print(scores.mean())
for label, columns in forest_runs:
    scores = cross_val_score(RandomForestClassifier(n_estimators=1000, n_jobs=4), iguais[columns], iguais['Output'], cv=10, scoring='neg_log_loss')
    print(label)
    print(scores.mean())
In [23]:
# sklearn.cross_validation is deprecated in favour of sklearn.model_selection; prefer the
# new module and fall back only on installations that do not have it.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Fresh 67/33 split (unseeded, so it differs on every run); the per-segment models
# fitted in the next cell use these training rows.
X_train, X_test, y_train, y_test = train_test_split(data[ft], data['Output'], test_size=0.33)

# Training rows with their target, split into the three rating-gap segments
# (home stronger by >= 3, away stronger by >= 3, absolute gap < 3).
treino = pd.concat([X_train, y_train], axis=1)
mandante_forte = treino[treino['h_overall'] >= (treino['a_overall'] + 3)]
visitante_forte = treino[treino['a_overall'] >= (treino['h_overall'] + 3)]
iguais = treino[abs(treino['h_overall'] - (treino['a_overall'])) < 3]

# Share of rows counted under key 0 of Output.value_counts(), per segment.
for segment in (mandante_forte, visitante_forte, iguais):
    print(segment['Output'].value_counts()[0] / (segment.shape[0] * 1.))
In [24]:
# Fit one logistic regression per rating segment on the `base` columns:
# reg_H for home-stronger rows, reg_A for away-stronger rows, reg_I for evenly rated rows.
# fit() returns the estimator itself, so the first two are fitted and bound in one step.
reg_H = LogisticRegression().fit(mandante_forte[base], mandante_forte['Output'])
reg_A = LogisticRegression().fit(visitante_forte[base], visitante_forte['Output'])
reg_I = LogisticRegression()
# Left as the last expression so the fitted estimator is still shown as the cell output.
reg_I.fit(iguais[base], iguais['Output'])
Out[24]:
In [25]:
# Held-out rows with their target, split into the same three rating-gap segments.
# The original built `teste` from X_train/y_train and indexed `treino` with the masks,
# so the per-segment models were "tested" on the rows they had been fitted on.
teste = pd.concat([X_test, y_test], axis=1)
m = teste[teste['h_overall'] >= (teste['a_overall'] + 3)]     # home stronger by >= 3
v = teste[teste['a_overall'] >= (teste['h_overall'] + 3)]     # away stronger by >= 3
i = teste[abs(teste['h_overall'] - (teste['a_overall'])) < 3]  # absolute gap < 3
In [32]:
# Mean predicted log-probability of each per-segment model on its own segment.
# NOTE(review): .mean() averages over every class column of predict_log_proba, not only
# the true class, so these values are not directly comparable with the 'neg_log_loss'
# cross-validation scores printed elsewhere — confirm this is the intended metric.
segment_log_proba = [
    reg_H.predict_log_proba(m[base]).mean(),
    reg_A.predict_log_proba(v[base]).mean(),
    reg_I.predict_log_proba(i[base]).mean(),
]
for segment_value in segment_log_proba:
    print(segment_value)
# Unweighted average of the three values above, computed from the current run instead
# of numbers pasted from an earlier one (the split is unseeded, so those go stale).
print(sum(segment_log_proba) / 3)
In [35]:
# 10-fold accuracy of logistic regression on the `base` columns and on all `ft` columns.
scores = cross_val_score(LogisticRegression(), data[base], data['Output'], cv=10, scoring='accuracy')
print("Regressão Logistica - Base")
print(scores.mean())
scores = cross_val_score(LogisticRegression(), data[ft], data['Output'], cv=10, scoring='accuracy')
# The original printed "Base" here as well, although this run uses every `ft` column.
print("Regressão Logistica - Todas as features")
print(scores.mean())