In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sqlite3
import numpy as np
from numpy import random
database = 'database.sqlite'
conn = sqlite3.connect(database)
In [2]:
query = """select * from match where home_player_X1 is not null and 'home_player_Y1' is not null
and home_player_X2 is not null and 'home_player_Y2' is not null
and home_player_X3 is not null and home_player_Y3 is not null
and home_player_X4 is not null and home_player_Y4 is not null
and home_player_X5 is not null and home_player_Y5 is not null
and home_player_X6 is not null and home_player_Y6 is not null
and home_player_X7 is not null and home_player_Y7 is not null
and home_player_X8 is not null and home_player_Y8 is not null
and home_player_X9 is not null and home_player_Y9 is not null
and home_player_X10 is not null and home_player_Y10 is not null
and home_player_X11 is not null and home_player_Y11 is not null
and away_player_X1 is not null and away_player_Y1 is not null
and away_player_X2 is not null and away_player_Y2 is not null
and away_player_X3 is not null and away_player_Y3 is not null
and away_player_X4 is not null and away_player_Y4 is not null
and away_player_X5 is not null and away_player_Y5 is not null
and away_player_X6 is not null and away_player_Y6 is not null
and away_player_X7 is not null and away_player_Y7 is not null
and away_player_X8 is not null and away_player_Y8 is not null
and away_player_X9 is not null and away_player_Y9 is not null
and away_player_X10 is not null and away_player_Y10 is not null
and away_player_X11 is not null and away_player_Y11 is not null
and home_team_goal is not null and away_team_goal is not null
and home_player_1 is not null
and home_player_2 is not null
and home_player_3 is not null
and home_player_4 is not null
and home_player_5 is not null
and home_player_6 is not null
and home_player_7 is not null
and home_player_8 is not null
and home_player_9 is not null
and home_player_10 is not null
and home_player_11 is not null
and away_player_1 is not null
and away_player_2 is not null
and away_player_3 is not null
and away_player_4 is not null
and away_player_5 is not null
and away_player_6 is not null
and away_player_7 is not null
and away_player_8 is not null
and away_player_9 is not null
and away_player_10 is not null
and away_player_11 is not null
and B365H is not null and B365D is not null and B365A is not null;"""
matches = pd.read_sql(query, conn)
drop = matches.columns.values[-27:-1]
#Removing other betting houses odds
matches = matches.drop(drop,1)
matches= matches.drop('BSA',1)
#Raw features
matches= matches.drop(['goal', 'shoton', 'shotoff', 'foulcommit', 'card',
'cross', 'corner', 'possession'],1)
matches.columns.values
Out[2]:
In [3]:
#Transforming date column into a date type
#matches['date']=pd.to_datetime(matches['date'], format='%Y-%m-%d %H:%M:%S.%f')
#
#matches['date']=matches['date']-matches['date'].unique()[0]
#matches['date']=matches['date'].astype('timedelta64[D]')
#from sklearn import preprocessing
#le = preprocessing.LabelEncoder()
#matches['season'] = le.fit_transform(matches['season'].astype('str'))
In [4]:
import re
def def_formations(matches_positions):
#pos=matches_positions.to_dataframe()
form=[]
for index,row in matches_positions.iterrows():
b= row.values
dfs = (b <= 3).sum()
mid1 = ((b >= 4) & (b<=6)).sum()
mid2 = ((b >= 7) & (b<=9)).sum()
atk1 = ((b >= 10)).sum()
formation="%d-%d-%d-%d"%(dfs,mid1,mid2,atk1)
formation = re.sub('0-','',formation)
form.append(formation)
return form
In [5]:
positions_home= matches[['home_player_Y2',
'home_player_Y3',
'home_player_Y4',
'home_player_Y5',
'home_player_Y6',
'home_player_Y7',
'home_player_Y8',
'home_player_Y9',
'home_player_Y10',
'home_player_Y11']]
positions_away = matches[['away_player_Y2',
'away_player_Y3',
'away_player_Y4',
'away_player_Y5',
'away_player_Y6',
'away_player_Y7',
'away_player_Y8',
'away_player_Y9',
'away_player_Y10',
'away_player_Y11']]
formation_home=def_formations(positions_home)
formation_away=def_formations(positions_away)
matches['formation_h']=formation_home
matches['formation_a']=formation_away
matches['formation_a'].unique()
Out[5]:
In [6]:
matches= matches.drop(['home_player_X1', 'home_player_X2', 'home_player_X3',
'home_player_X4', 'home_player_X5', 'home_player_X6',
'home_player_X7', 'home_player_X8', 'home_player_X9',
'home_player_X10', 'home_player_X11', 'away_player_X1',
'away_player_X2', 'away_player_X3', 'away_player_X4',
'away_player_X5', 'away_player_X6', 'away_player_X7',
'away_player_X8', 'away_player_X9', 'away_player_X10',
'away_player_X11', 'home_player_Y1', 'home_player_Y2',
'home_player_Y3', 'home_player_Y4', 'home_player_Y5',
'home_player_Y6', 'home_player_Y7', 'home_player_Y8',
'home_player_Y9', 'home_player_Y10', 'home_player_Y11',
'away_player_Y1', 'away_player_Y2', 'away_player_Y3',
'away_player_Y4', 'away_player_Y5', 'away_player_Y6',
'away_player_Y7', 'away_player_Y8', 'away_player_Y9',
'away_player_Y10', 'away_player_Y11'],1)
In [7]:
def determine_home_result(match):
if match['home_team_goal'] > match['away_team_goal']:
return 'H'
elif match['home_team_goal'] < match['away_team_goal']:
return 'A'
else:
return 'D'
matches['Output']=matches.apply(determine_home_result, axis=1)
In [8]:
matches.columns.values
Out[8]:
In [9]:
data=matches
data = data[data['league_id']!= 0]
data = data[data['season']!= '2008/2009']
data[data['league_id'] == 1]['season'].unique()
data.sort_values(["country_id","league_id","season","stage"])
Out[9]:
In [20]:
def tabela_mandante(stageI, stageF, league_id, season, team_id):
sql = """select team, count(case when gf > gs then 1 end) home_V,
count(case when gs > gf then 1 end) home_D,
count(case when gf = gs then 1 end) home_E,
sum(gf) home_GF, avg(gf) as home_AVG_GF , sum(gs) as home_GS , avg(gs) as home_AVG_GS from
(select home_team_api_id team, home_team_goal gf, away_team_goal gs from Match
where (stage between """+str(stageI)+""" and """+str(stageF)+""" ) and (season='"""+str(season)+"""') and league_id ="""+str(league_id)+""" and home_team_api_id="""+str(team_id)+""") a
group by team order by home_V desc , home_GF-home_GS desc, home_GF desc;"""
return pd.read_sql(sql, conn)
def tabela_visitante(stageI, stageF, league_id, season, team_id):
sql = """select team as team, count(case when gf > gs then 1 end) away_V,
count(case when gs > gf then 1 end) away_D,
count(case when gf = gs then 1 end) away_E,
sum(gf) away_GF, avg(gf) as away_AVG_GF , sum(gs) as away_GS , avg(gs) as away_AVG_GS from
(select away_team_api_id team, away_team_goal gf, home_team_goal gs from Match
where (stage between """+str(stageI)+""" and """+str(stageF)+""" ) and (season='"""+str(season)+"""') and league_id ="""+str(league_id)+""" and away_team_api_id="""+str(team_id)+""") a
group by team order by away_V desc , away_GF-away_GS desc, away_GF desc;"""
return pd.read_sql(sql, conn)
def tabela(stageI, stageF, league_id, season, team_id,mando):
sql= """select team , count(case when gf > gs then 1 end) """+mando+"""_VG,
count(case when gs > gf then 1 end) """+mando+"""_DG,
count(case when gf = gs then 1 end) """+mando+"""_EG,
sum(gf) """+mando+"""_GFG, avg(gf) as """+mando+"""_AVG_GFG , sum(gs) as """+mando+"""_GSG , avg(gs) as """+mando+"""_AVG_GSG from
(select away_team_api_id as team, home_team_goal as gs , away_team_goal as gf from Match
where (stage between """+str(stageI)+""" and """+str(stageF)+""" ) and (season='"""+str(season)+"""') and league_id ="""+str(league_id)+""" and away_team_api_id="""+str(team_id)+"""
union all select home_team_api_id as team, away_team_goal as gs, home_team_goal as gf from Match
where (stage between """+str(stageI)+""" and """+str(stageF)+""" ) and (season='"""+str(season)+"""') and league_id ="""+str(league_id)+""" and home_team_api_id="""+str(team_id)+""") a group by team order by """+mando+"""_VG desc , """+mando+"""_GFG-"""+mando+"""_GSG desc, """+mando+"""_GFG desc;"""
return pd.read_sql(sql, conn)
def team_attribute(team_id,seasonI,seasonF):
sql=""" select buildUpPlaySpeed,buildUpPlaySpeedClass,buildUpPlayDribblingClass,buildUpPlayPassing,
buildUpPlayPassingClass,buildUpPlayPositioningClass,chanceCreationPassing,chanceCreationPassingClass,
chanceCreationCrossing,chanceCreationCrossingClass,chanceCreationShooting,chanceCreationShootingClass,
chanceCreationPositioningClass,defencePressure,defencePressureClass,defenceAggression,
defenceAggressionClass,defenceTeamWidth,defenceTeamWidthClass,defenceDefenderLineClass
from Team_Attributes where team_api_id ="""+str(team_id)+""" and date between '"""+str(seasonI)+"""' and '"""+str(seasonF)+"""' ;"""
return pd.read_sql(sql, conn)
def players_hw(player1,player2,player3,player4,player5,player6,player7,player8,player9,player10,player11):
sql="""select * from Player where player_api_id = """+str(player1)+""" or player_api_id="""+str(player2)+"""
or player_api_id="""+str(player3)+""" or player_api_id="""+str(player4)+""" or player_api_id="""+str(player5)+"""
or player_api_id="""+str(player6)+""" or player_api_id="""+str(player7)+""" or player_api_id="""+str(player8)+"""
or player_api_id="""+str(player9)+""" or player_api_id="""+str(player10)+""" or player_api_id="""+str(player11)+""";"""
return pd.read_sql(sql, conn)
In [19]:
data_final = pd.DataFrame()
delta_stage=5
datar = data[data['league_id']==1729]
datar = datar[datar['season']=='2012/2013']
datar = datar[datar['stage']>5]
datar = datar[datar['stage']<7]
#datar.apply(lambda row:tabela_mandante(row['stage'],row['stage']+delta_stage,row['league_id'],row['season'],row['home_team_api_id']))
for index, row in datar.iterrows():
if row['stage']>5:
#Pega os dados temporal e local do mandante
local_mandante = tabela_mandante(row['stage'],row['stage']+delta_stage,row['league_id'],row['season'],row['home_team_api_id'])
global_mandante = tabela(row['stage'],row['stage']+delta_stage,row['league_id'],row['season'],row['home_team_api_id'],'home')
mandante = pd.concat([local_mandante,global_mandante],axis=1)
mandante = mandante.drop('team',axis=1)
local_visitante = tabela_visitante(row['stage'],row['stage']+delta_stage,row['league_id'],row['season'],row['home_team_api_id'])
global_visitante = tabela(row['stage'],row['stage']+delta_stage,row['league_id'],row['season'],row['home_team_api_id'],'away')
visitante = pd.concat([local_visitante,global_visitante],axis=1)
visitante = visitante.drop('team',axis=1)
res = pd.concat([mandante,visitante],axis=1)
#Pegando os dados sobre estilo de cada equipe que o FIFA classifica
seasonI,seasonF=str(row['season']).split('/')
home_team_attributes = team_attribute(row['home_team_api_id'],seasonI,seasonF)
home_team_attributes.columns = ["h_"+ str(col) for col in home_team_attributes.columns]
away_team_attributes = team_attribute(row['away_team_api_id'],seasonI,seasonF)
away_team_attributes.columns = ["a_"+ str(col) for col in away_team_attributes.columns]
team_attributes = pd.concat([home_team_attributes,away_team_attributes],axis=1)
res = pd.concat([res,team_attributes],axis=1)
#pegando os dados altura e peso
home_hw = players_hw(row['home_player_1'],row['home_player_2'],row['home_player_3'],row['home_player_4'],row['home_player_5'],row['home_player_6'],row['home_player_7'],row['home_player_8'],row['home_player_9'],row['home_player_10'],row['home_player_11'])
away_hw = players_hw(row['away_player_1'],row['away_player_2'],row['away_player_3'],row['away_player_4'],row['away_player_5'],row['away_player_6'],row['away_player_7'],row['away_player_8'],row['away_player_9'],row['away_player_10'],row['away_player_11'])
hw = pd.concat([home_hw,away_hw],axis=1)
res = pd.concat([res,hw],axis=1)
#Junta os dados do jogo atual com os dados gerados pelas funções anteriores.
for field in row.index:
res[field] = row[field]
data_final = data_final.append(res)
print data_final.columns
In [ ]: