CS584: FIFA18 data Analysis

*Ting Jiang*
*Chen Gong*
*Yizhi Hong*

Part 1: Data pre-processing



In [1]:

    
import numpy as np
import sys
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
import glob
import warnings
warnings.filterwarnings("ignore")



In [2]:

    
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
sns.set_style("dark")









    



/Users/Chi.Hong/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

look up data firt 10 columns



In [3]:

    
dataframe = pd.read_csv('../data/fifa-18-demo-player-dataset/CompleteDataset.csv')
dataframe.head(10)









    Out[3]:







  
    
      
      Unnamed: 0
      Name
      Age
      Photo
      Nationality
      Flag
      Overall
      Potential
      Club
      Club Logo
      ...
      RB
      RCB
      RCM
      RDM
      RF
      RM
      RS
      RW
      RWB
      ST
    
  
  
    
      0
      0
      Cristiano Ronaldo
      32
      https://cdn.sofifa.org/48/18/players/20801.png
      Portugal
      https://cdn.sofifa.org/flags/38.png
      94
      94
      Real Madrid CF
      https://cdn.sofifa.org/24/18/teams/243.png
      ...
      61.0
      53.0
      82.0
      62.0
      91.0
      89.0
      92.0
      91.0
      66.0
      92.0
    
    
      1
      1
      L. Messi
      30
      https://cdn.sofifa.org/48/18/players/158023.png
      Argentina
      https://cdn.sofifa.org/flags/52.png
      93
      93
      FC Barcelona
      https://cdn.sofifa.org/24/18/teams/241.png
      ...
      57.0
      45.0
      84.0
      59.0
      92.0
      90.0
      88.0
      91.0
      62.0
      88.0
    
    
      2
      2
      Neymar
      25
      https://cdn.sofifa.org/48/18/players/190871.png
      Brazil
      https://cdn.sofifa.org/flags/54.png
      92
      94
      Paris Saint-Germain
      https://cdn.sofifa.org/24/18/teams/73.png
      ...
      59.0
      46.0
      79.0
      59.0
      88.0
      87.0
      84.0
      89.0
      64.0
      84.0
    
    
      3
      3
      L. Suárez
      30
      https://cdn.sofifa.org/48/18/players/176580.png
      Uruguay
      https://cdn.sofifa.org/flags/60.png
      92
      92
      FC Barcelona
      https://cdn.sofifa.org/24/18/teams/241.png
      ...
      64.0
      58.0
      80.0
      65.0
      88.0
      85.0
      88.0
      87.0
      68.0
      88.0
    
    
      4
      4
      M. Neuer
      31
      https://cdn.sofifa.org/48/18/players/167495.png
      Germany
      https://cdn.sofifa.org/flags/21.png
      92
      92
      FC Bayern Munich
      https://cdn.sofifa.org/24/18/teams/21.png
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      5
      5
      R. Lewandowski
      28
      https://cdn.sofifa.org/48/18/players/188545.png
      Poland
      https://cdn.sofifa.org/flags/37.png
      91
      91
      FC Bayern Munich
      https://cdn.sofifa.org/24/18/teams/21.png
      ...
      58.0
      57.0
      78.0
      62.0
      87.0
      82.0
      88.0
      84.0
      61.0
      88.0
    
    
      6
      6
      De Gea
      26
      https://cdn.sofifa.org/48/18/players/193080.png
      Spain
      https://cdn.sofifa.org/flags/45.png
      90
      92
      Manchester United
      https://cdn.sofifa.org/24/18/teams/11.png
      ...
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
      NaN
    
    
      7
      7
      E. Hazard
      26
      https://cdn.sofifa.org/48/18/players/183277.png
      Belgium
      https://cdn.sofifa.org/flags/7.png
      90
      91
      Chelsea
      https://cdn.sofifa.org/24/18/teams/5.png
      ...
      59.0
      47.0
      81.0
      61.0
      87.0
      87.0
      82.0
      88.0
      64.0
      82.0
    
    
      8
      8
      T. Kroos
      27
      https://cdn.sofifa.org/48/18/players/182521.png
      Germany
      https://cdn.sofifa.org/flags/21.png
      90
      90
      Real Madrid CF
      https://cdn.sofifa.org/24/18/teams/243.png
      ...
      76.0
      72.0
      87.0
      82.0
      81.0
      81.0
      77.0
      80.0
      78.0
      77.0
    
    
      9
      9
      G. Higuaín
      29
      https://cdn.sofifa.org/48/18/players/167664.png
      Argentina
      https://cdn.sofifa.org/flags/52.png
      90
      90
      Juventus
      https://cdn.sofifa.org/24/18/teams/45.png
      ...
      51.0
      46.0
      71.0
      52.0
      84.0
      79.0
      87.0
      82.0
      55.0
      87.0
    
  

10 rows × 75 columns



In [4]:

    
dataframe.columns









    Out[4]:





Index(['Unnamed: 0', 'Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall',
       'Potential', 'Club', 'Club Logo', 'Value', 'Wage', 'Special',
       'Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control',
       'Composure', 'Crossing', 'Curve', 'Dribbling', 'Finishing',
       'Free kick accuracy', 'GK diving', 'GK handling', 'GK kicking',
       'GK positioning', 'GK reflexes', 'Heading accuracy', 'Interceptions',
       'Jumping', 'Long passing', 'Long shots', 'Marking', 'Penalties',
       'Positioning', 'Reactions', 'Short passing', 'Shot power',
       'Sliding tackle', 'Sprint speed', 'Stamina', 'Standing tackle',
       'Strength', 'Vision', 'Volleys', 'CAM', 'CB', 'CDM', 'CF', 'CM', 'ID',
       'LAM', 'LB', 'LCB', 'LCM', 'LDM', 'LF', 'LM', 'LS', 'LW', 'LWB',
       'Preferred Positions', 'RAM', 'RB', 'RCB', 'RCM', 'RDM', 'RF', 'RM',
       'RS', 'RW', 'RWB', 'ST'],
      dtype='object')

take the attribute that we need to use

The attribute needs to be predicted: 'Overall','Preferred Positions'

The attribute use to predict: rest of the attributes



In [5]:

    
# only consider non goalkeeper's position.

col_needed = ['Overall','Acceleration', 'Aggression', 'Agility', 'Balance', 'Ball control',
       'Composure', 'Crossing', 'Curve', 'Dribbling', 'Finishing',
       'Free kick accuracy', 'Heading accuracy', 'Interceptions',
       'Jumping', 'Long passing', 'Long shots', 'Marking', 'Penalties',
       'Positioning', 'Reactions', 'Short passing', 'Shot power',
       'Sliding tackle', 'Sprint speed', 'Stamina', 'Standing tackle',
       'Strength', 'Vision', 'Volleys', 'Preferred Positions']

# rearrange the attributes. The attribute need to be predicted: Overall, Preferred Position
# rearrange as ST -> CM -> CB

recol_needed = ['Overall','Finishing', 'Shot power', 'Positioning', 'Dribbling', 'Long shots','Penalties', 'Volleys', 
                 'Acceleration', 'Agility','Sprint speed', 'Curve',
                
       'Free kick accuracy', 'Heading accuracy', 
       'Short passing', 'Long passing', 'Vision',
       'Strength', 'Stamina', 'Balance', 'Ball control','Composure','Jumping', 
       'Crossing','Reactions',
       'Aggression','Interceptions', 'Marking', 'Sliding tackle', 'Standing tackle','Preferred Positions']

dataframe = dataframe[recol_needed]
dataframe.head(10)









    Out[5]:







  
    
      
      Overall
      Finishing
      Shot power
      Positioning
      Dribbling
      Long shots
      Penalties
      Volleys
      Acceleration
      Agility
      ...
      Composure
      Jumping
      Crossing
      Reactions
      Aggression
      Interceptions
      Marking
      Sliding tackle
      Standing tackle
      Preferred Positions
    
  
  
    
      0
      94
      94
      94
      95
      91
      92
      85
      88
      89
      89
      ...
      95
      95
      85
      96
      63
      29
      22
      23
      31
      ST LW
    
    
      1
      93
      95
      85
      93
      97
      88
      74
      85
      92
      90
      ...
      96
      68
      77
      95
      48
      22
      13
      26
      28
      RW
    
    
      2
      92
      89
      80
      90
      96
      77
      81
      83
      94
      96
      ...
      92
      61
      75
      88
      56
      36
      21
      33
      24
      LW
    
    
      3
      92
      94
      87
      92
      86
      86
      85
      88
      88
      86
      ...
      83
      69
      77
      93
      78
      41
      30
      38
      45
      ST
    
    
      4
      92
      13
      25
      12
      30
      16
      47
      11
      58
      52
      ...
      70
      78
      15
      85
      29
      30
      10
      11
      10
      GK
    
    
      5
      91
      91
      88
      91
      85
      83
      81
      87
      79
      78
      ...
      87
      84
      62
      91
      80
      39
      25
      19
      42
      ST
    
    
      6
      90
      13
      31
      12
      18
      12
      40
      13
      57
      60
      ...
      64
      67
      17
      88
      38
      30
      13
      13
      21
      GK
    
    
      7
      90
      83
      79
      85
      93
      82
      86
      79
      93
      93
      ...
      87
      59
      80
      85
      54
      41
      25
      22
      27
      LW
    
    
      8
      90
      76
      87
      79
      79
      90
      73
      82
      60
      71
      ...
      85
      32
      85
      86
      60
      85
      63
      69
      82
      CDM CM
    
    
      9
      90
      91
      88
      92
      84
      82
      70
      88
      78
      75
      ...
      86
      79
      68
      88
      50
      20
      12
      18
      22
      ST
    
  

10 rows × 31 columns



In [6]:

    
dataframe['Preferred Positions'] = dataframe['Preferred Positions'].str.strip()
#remove Goalkeeper from dataframe

dataframe = dataframe[dataframe['Preferred Positions'] != 'GK']
dataframe.head(10)









    Out[6]:







  
    
      
      Overall
      Finishing
      Shot power
      Positioning
      Dribbling
      Long shots
      Penalties
      Volleys
      Acceleration
      Agility
      ...
      Composure
      Jumping
      Crossing
      Reactions
      Aggression
      Interceptions
      Marking
      Sliding tackle
      Standing tackle
      Preferred Positions
    
  
  
    
      0
      94
      94
      94
      95
      91
      92
      85
      88
      89
      89
      ...
      95
      95
      85
      96
      63
      29
      22
      23
      31
      ST LW
    
    
      1
      93
      95
      85
      93
      97
      88
      74
      85
      92
      90
      ...
      96
      68
      77
      95
      48
      22
      13
      26
      28
      RW
    
    
      2
      92
      89
      80
      90
      96
      77
      81
      83
      94
      96
      ...
      92
      61
      75
      88
      56
      36
      21
      33
      24
      LW
    
    
      3
      92
      94
      87
      92
      86
      86
      85
      88
      88
      86
      ...
      83
      69
      77
      93
      78
      41
      30
      38
      45
      ST
    
    
      5
      91
      91
      88
      91
      85
      83
      81
      87
      79
      78
      ...
      87
      84
      62
      91
      80
      39
      25
      19
      42
      ST
    
    
      7
      90
      83
      79
      85
      93
      82
      86
      79
      93
      93
      ...
      87
      59
      80
      85
      54
      41
      25
      22
      27
      LW
    
    
      8
      90
      76
      87
      79
      79
      90
      73
      82
      60
      71
      ...
      85
      32
      85
      86
      60
      85
      63
      69
      82
      CDM CM
    
    
      9
      90
      91
      88
      92
      84
      82
      70
      88
      78
      75
      ...
      86
      79
      68
      88
      50
      20
      12
      18
      22
      ST
    
    
      10
      90
      60
      79
      52
      61
      55
      68
      66
      75
      79
      ...
      80
      93
      66
      85
      84
      88
      86
      91
      89
      CB
    
    
      11
      89
      83
      85
      84
      85
      86
      77
      82
      76
      80
      ...
      84
      65
      90
      88
      68
      56
      30
      40
      51
      RM CM CAM
    
  

10 rows × 31 columns

Check the data



In [7]:

    
# make sure no null value.
dataframe.isnull().values.any()









    Out[7]:





False



In [8]:

    
# Check all the positions we have.
positions = dataframe['Preferred Positions'].str.split().apply(lambda x: x[0]).unique()
positions









    Out[8]:





array(['ST', 'RW', 'LW', 'CDM', 'CB', 'RM', 'CM', 'LM', 'LB', 'CAM', 'RB',
       'CF', 'RWB', 'LWB'], dtype=object)



In [9]:

    
# handle multiple positions
df_fifa = dataframe.copy()
df_fifa.drop(df_fifa.index, inplace=True)

for position in positions:
    temp = dataframe[dataframe['Preferred Positions'].str.contains(position)]
    temp['Preferred Positions'] = position
    df_fifa = df_fifa.append(temp, ignore_index=True)
    
df_fifa.iloc[::1000, :]









    Out[9]:







  
    
      
      Overall
      Finishing
      Shot power
      Positioning
      Dribbling
      Long shots
      Penalties
      Volleys
      Acceleration
      Agility
      ...
      Composure
      Jumping
      Crossing
      Reactions
      Aggression
      Interceptions
      Marking
      Sliding tackle
      Standing tackle
      Preferred Positions
    
  
  
    
      0
      94
      94
      94
      95
      91
      92
      85
      88
      89
      89
      ...
      95
      95
      85
      96
      63
      29
      22
      23
      31
      ST
    
    
      1000
      70
      68
      66
      75
      67
      63
      67
      63
      63
      68
      ...
      67
      72
      62
      68
      70
      56
      21
      25
      31
      ST
    
    
      2000
      64
      53
      50
      61
      67
      42
      51
      48
      86
      81
      ...
      44
      60
      59
      58
      32
      17
      23
      26
      24
      ST
    
    
      3000
      56
      61
      53
      49
      48
      49
      53
      47
      56
      58
      ...
      49
      78
      31
      53
      42
      28
      22
      19
      24
      ST
    
    
      4000
      62
      57
      59
      60
      61
      52
      57
      45
      79
      70
      ...
      49
      54
      59
      65
      41
      26
      22
      20
      19
      RW
    
    
      5000
      61
      46
      70
      72
      60
      54
      63
      67
      76
      74
      ...
      58
      51
      47
      58
      34
      13
      17
      17
      17
      LW
    
    
      6000
      71
      57-3
      73-5
      63-3
      67-1
      68-4
      69
      49
      45
      68
      ...
      77
      73
      61-3
      72
      68
      69-4
      68
      66-3
      69-2
      CDM
    
    
      7000
      65
      60
      55
      66
      63
      55
      65
      51
      66
      61
      ...
      57
      66
      66
      60
      61
      55
      45
      58
      64
      CDM
    
    
      8000
      78
      65
      77
      59
      64
      64
      50
      56
      58
      52
      ...
      67
      74
      53
      72
      77
      78
      75
      76
      81
      CB
    
    
      9000
      70
      22
      62
      44
      58
      44
      62
      21
      51
      39
      ...
      61
      57
      57
      68
      73
      72
      72
      65
      70
      CB
    
    
      10000
      65
      19
      40
      22
      32
      13
      17
      18
      53
      60
      ...
      56
      80
      34
      54
      60
      57
      62
      66
      68
      CB
    
    
      11000
      59
      19
      38
      29
      50
      22
      32
      26
      68
      58
      ...
      55
      64
      45
      56
      56
      59
      56
      56
      59
      CB
    
    
      12000
      72
      65
      63
      53
      79
      63
      75
      68
      77
      76
      ...
      69
      60
      64
      71
      74
      28
      22
      17
      18
      RM
    
    
      13000
      66
      61
      68
      58
      72
      65
      59
      58
      77
      86
      ...
      67
      46
      58
      55
      48
      37
      40
      26
      34
      RM
    
    
      14000
      50
      35
      53
      52
      45
      32
      44
      37
      69
      62
      ...
      46
      63
      51
      51
      48
      39
      39
      45
      47
      RM
    
    
      15000
      71
      53
      56+2
      68
      73
      55+5
      45
      46
      63
      75
      ...
      72
      53
      61
      70
      61
      60
      59
      43
      64
      CM
    
    
      16000
      66
      52
      68
      66
      69
      61
      49
      59
      78
      79
      ...
      60
      59
      65
      64
      67
      52
      47
      52
      53
      CM
    
    
      17000
      60
      32
      60
      50
      56
      51
      47
      34
      69
      56
      ...
      51
      60
      48
      48
      64
      60
      58
      50
      61
      CM
    
    
      18000
      74
      67
      76
      68
      74
      83
      48
      64
      85
      80
      ...
      74
      56
      72
      69
      41
      68
      72
      76
      72
      LM
    
    
      19000
      67
      38
      40
      67
      67
      40
      35
      21
      78
      76
      ...
      61
      60
      66
      66
      68
      59
      61
      62
      63
      LM
    
    
      20000
      57
      39
      43
      50
      58
      47
      39
      44
      71
      72
      ...
      58
      78
      47
      53
      33
      43
      42
      41
      39
      LM
    
    
      21000
      67
      43
      58
      40
      63
      60
      55
      39
      72
      59
      ...
      59
      64
      54
      62
      65
      68
      65
      60
      66
      LB
    
    
      22000
      58
      23
      23
      45
      52
      24
      38
      26
      77
      73
      ...
      33
      59
      54
      54
      60
      54
      55
      55
      59
      LB
    
    
      23000
      70
      57
      61
      65
      68
      58
      61
      53
      78
      80
      ...
      60
      55
      67
      62
      52
      36
      45
      47
      46
      CAM
    
    
      24000
      62
      50
      56
      54
      57
      56
      55
      49
      59
      81
      ...
      58
      73
      57
      58
      58
      52
      47
      45
      57
      CAM
    
    
      25000
      70
      53
      64
      70
      69
      61
      51
      33
      78
      67
      ...
      65
      81
      71
      63
      76
      66
      64
      70
      67
      RB
    
    
      26000
      62
      22
      38
      41
      43
      23
      39
      29
      44
      62
      ...
      45
      75
      27
      54
      53
      56
      65
      62
      58
      RB
    
    
      27000
      64
      36
      35
      38
      62
      33
      46
      34
      76
      60
      ...
      54
      70
      41
      59
      57
      64
      70
      72
      73
      RWB
    
  

28 rows × 31 columns



In [10]:

    
cols = [col for col in df_fifa.columns if col not in ['Preferred Positions']]

for i in cols:
    df_fifa[i] = df_fifa[i].apply(lambda x: eval(x) if isinstance(x,str) else x)

df_fifa.iloc[::1000, :]









    Out[10]:







  
    
      
      Overall
      Finishing
      Shot power
      Positioning
      Dribbling
      Long shots
      Penalties
      Volleys
      Acceleration
      Agility
      ...
      Composure
      Jumping
      Crossing
      Reactions
      Aggression
      Interceptions
      Marking
      Sliding tackle
      Standing tackle
      Preferred Positions
    
  
  
    
      0
      94
      94
      94
      95
      91
      92
      85
      88
      89
      89
      ...
      95
      95
      85
      96
      63
      29
      22
      23
      31
      ST
    
    
      1000
      70
      68
      66
      75
      67
      63
      67
      63
      63
      68
      ...
      67
      72
      62
      68
      70
      56
      21
      25
      31
      ST
    
    
      2000
      64
      53
      50
      61
      67
      42
      51
      48
      86
      81
      ...
      44
      60
      59
      58
      32
      17
      23
      26
      24
      ST
    
    
      3000
      56
      61
      53
      49
      48
      49
      53
      47
      56
      58
      ...
      49
      78
      31
      53
      42
      28
      22
      19
      24
      ST
    
    
      4000
      62
      57
      59
      60
      61
      52
      57
      45
      79
      70
      ...
      49
      54
      59
      65
      41
      26
      22
      20
      19
      RW
    
    
      5000
      61
      46
      70
      72
      60
      54
      63
      67
      76
      74
      ...
      58
      51
      47
      58
      34
      13
      17
      17
      17
      LW
    
    
      6000
      71
      54
      68
      60
      66
      64
      69
      49
      45
      68
      ...
      77
      73
      58
      72
      68
      65
      68
      63
      67
      CDM
    
    
      7000
      65
      60
      55
      66
      63
      55
      65
      51
      66
      61
      ...
      57
      66
      66
      60
      61
      55
      45
      58
      64
      CDM
    
    
      8000
      78
      65
      77
      59
      64
      64
      50
      56
      58
      52
      ...
      67
      74
      53
      72
      77
      78
      75
      76
      81
      CB
    
    
      9000
      70
      22
      62
      44
      58
      44
      62
      21
      51
      39
      ...
      61
      57
      57
      68
      73
      72
      72
      65
      70
      CB
    
    
      10000
      65
      19
      40
      22
      32
      13
      17
      18
      53
      60
      ...
      56
      80
      34
      54
      60
      57
      62
      66
      68
      CB
    
    
      11000
      59
      19
      38
      29
      50
      22
      32
      26
      68
      58
      ...
      55
      64
      45
      56
      56
      59
      56
      56
      59
      CB
    
    
      12000
      72
      65
      63
      53
      79
      63
      75
      68
      77
      76
      ...
      69
      60
      64
      71
      74
      28
      22
      17
      18
      RM
    
    
      13000
      66
      61
      68
      58
      72
      65
      59
      58
      77
      86
      ...
      67
      46
      58
      55
      48
      37
      40
      26
      34
      RM
    
    
      14000
      50
      35
      53
      52
      45
      32
      44
      37
      69
      62
      ...
      46
      63
      51
      51
      48
      39
      39
      45
      47
      RM
    
    
      15000
      71
      53
      58
      68
      73
      60
      45
      46
      63
      75
      ...
      72
      53
      61
      70
      61
      60
      59
      43
      64
      CM
    
    
      16000
      66
      52
      68
      66
      69
      61
      49
      59
      78
      79
      ...
      60
      59
      65
      64
      67
      52
      47
      52
      53
      CM
    
    
      17000
      60
      32
      60
      50
      56
      51
      47
      34
      69
      56
      ...
      51
      60
      48
      48
      64
      60
      58
      50
      61
      CM
    
    
      18000
      74
      67
      76
      68
      74
      83
      48
      64
      85
      80
      ...
      74
      56
      72
      69
      41
      68
      72
      76
      72
      LM
    
    
      19000
      67
      38
      40
      67
      67
      40
      35
      21
      78
      76
      ...
      61
      60
      66
      66
      68
      59
      61
      62
      63
      LM
    
    
      20000
      57
      39
      43
      50
      58
      47
      39
      44
      71
      72
      ...
      58
      78
      47
      53
      33
      43
      42
      41
      39
      LM
    
    
      21000
      67
      43
      58
      40
      63
      60
      55
      39
      72
      59
      ...
      59
      64
      54
      62
      65
      68
      65
      60
      66
      LB
    
    
      22000
      58
      23
      23
      45
      52
      24
      38
      26
      77
      73
      ...
      33
      59
      54
      54
      60
      54
      55
      55
      59
      LB
    
    
      23000
      70
      57
      61
      65
      68
      58
      61
      53
      78
      80
      ...
      60
      55
      67
      62
      52
      36
      45
      47
      46
      CAM
    
    
      24000
      62
      50
      56
      54
      57
      56
      55
      49
      59
      81
      ...
      58
      73
      57
      58
      58
      52
      47
      45
      57
      CAM
    
    
      25000
      70
      53
      64
      70
      69
      61
      51
      33
      78
      67
      ...
      65
      81
      71
      63
      76
      66
      64
      70
      67
      RB
    
    
      26000
      62
      22
      38
      41
      43
      23
      39
      29
      44
      62
      ...
      45
      75
      27
      54
      53
      56
      65
      62
      58
      RB
    
    
      27000
      64
      36
      35
      38
      62
      33
      46
      34
      76
      60
      ...
      54
      70
      41
      59
      57
      64
      70
      72
      73
      RWB
    
  

28 rows × 31 columns

Part 2: Data Analyze

The plot below shows how the attributes contribute the position.



In [11]:

    
fig, fs = plt.subplots()

## show the 3 main positions  
df_ST = df_fifa[df_fifa['Preferred Positions'] == 'ST'].iloc[::10,:-1]
np.mean(df_ST).T.plot.line(color = 'red', figsize = (15,10), legend = 'ST',label='ST', ylim = (0, 110), title = "attributes distribution", ax=fs)

df_CM = df_fifa[df_fifa['Preferred Positions'] == 'CM'].iloc[::10,:-1]
np.mean(df_CM).T.plot.line(color = 'blue', figsize = (15,10), legend = 'CM',label='CM', ylim = (0, 110), title = "attributes distribution", ax=fs)

df_CB = df_fifa[df_fifa['Preferred Positions'] == 'CB'].iloc[::10,:-1]
np.mean(df_CB).T.plot.line(color = 'green', figsize = (15,10), legend = 'CB',label='CB', ylim = (0, 110), title = "attributes distribution", ax=fs)



fs.set_xlabel('Attributes')
fs.set_ylabel('Rating')

fs.set_xticks(np.arange(len(cols)))
fs.set_xticklabels(labels = cols, rotation=90)

for l in fs.lines:
    l.set_linewidth(1)

fs.axvline(0, color='red', linestyle='--')   
fs.axvline(12, color='red', linestyle='--')

fs.axvline(12.1, color='blue', linestyle='--')
fs.axvline(24, color='blue', linestyle='--')

fs.axvline(24.1, color='green', linestyle='--')
fs.axvline(29, color='green', linestyle='--')

fs.text(4, 85, 'Attack Attributes', color = 'red', weight = 'bold')
fs.text(15.5, 85, 'Mixed Attributes', color = 'blue', weight = 'bold')
fs.text(25, 85, 'Defend Attributes', color = 'green', weight = 'bold')
plt.show()

we can see above there is obvious margin between attacker's attributes and defender's attributes

1. Logistic Regression

* predict the Attacker or the Defender *

Set the ST/RW/LW/RM/CM/LM/CAM/CF as an Attacker group --> 1

Set the CDM/CB/LB/RB/RWB/LWB as an Defender group --> 0



In [12]:

    
# Set the baseline of the prediction
baseline = 1/2
print('The baseline is', baseline)









    



The baseline is 0.5



In [13]:

    
df_fifa_normalized = df_fifa.iloc[:,:-1].div(df_fifa.iloc[:,:-1].sum(axis=1), axis=0)
mapping = {'ST': 1, 'RW': 1, 'LW': 1, 'RM': 1, 'CM': 1, 'LM': 1, 'CAM': 1, 'CF': 1, 'CDM': 0, 'CB': 0, 'LB': 0, 'RB': 0, 'RWB': 0, 'LWB': 0}

df_fifa_normalized['Preferred Positions'] = df_fifa['Preferred Positions']

df_fifa_normalized = df_fifa_normalized.replace({'Preferred Positions': mapping})
df_fifa_normalized.iloc[::1000,]









    Out[13]:







  
    
      
      Overall
      Finishing
      Shot power
      Positioning
      Dribbling
      Long shots
      Penalties
      Volleys
      Acceleration
      Agility
      ...
      Composure
      Jumping
      Crossing
      Reactions
      Aggression
      Interceptions
      Marking
      Sliding tackle
      Standing tackle
      Preferred Positions
    
  
  
    
      0
      0.039847
      0.039847
      0.039847
      0.040271
      0.038576
      0.039000
      0.036032
      0.037304
      0.037728
      0.037728
      ...
      0.040271
      0.040271
      0.036032
      0.040695
      0.026706
      0.012293
      0.009326
      0.009750
      0.013141
      1
    
    
      1000
      0.037797
      0.036717
      0.035637
      0.040497
      0.036177
      0.034017
      0.036177
      0.034017
      0.034017
      0.036717
      ...
      0.036177
      0.038877
      0.033477
      0.036717
      0.037797
      0.030238
      0.011339
      0.013499
      0.016739
      1
    
    
      2000
      0.040842
      0.033823
      0.031908
      0.038928
      0.042757
      0.026803
      0.032546
      0.030632
      0.054882
      0.051691
      ...
      0.028079
      0.038290
      0.037652
      0.037013
      0.020421
      0.010849
      0.014678
      0.016592
      0.015316
      1
    
    
      3000
      0.039773
      0.043324
      0.037642
      0.034801
      0.034091
      0.034801
      0.037642
      0.033381
      0.039773
      0.041193
      ...
      0.034801
      0.055398
      0.022017
      0.037642
      0.029830
      0.019886
      0.015625
      0.013494
      0.017045
      1
    
    
      4000
      0.039241
      0.036076
      0.037342
      0.037975
      0.038608
      0.032911
      0.036076
      0.028481
      0.050000
      0.044304
      ...
      0.031013
      0.034177
      0.037342
      0.041139
      0.025949
      0.016456
      0.013924
      0.012658
      0.012025
      1
    
    
      5000
      0.038293
      0.028876
      0.043942
      0.045198
      0.037665
      0.033898
      0.039548
      0.042059
      0.047709
      0.046453
      ...
      0.036409
      0.032015
      0.029504
      0.036409
      0.021343
      0.008161
      0.010672
      0.010672
      0.010672
      1
    
    
      6000
      0.036960
      0.028110
      0.035398
      0.031234
      0.034357
      0.033316
      0.035919
      0.025508
      0.023425
      0.035398
      ...
      0.040083
      0.038001
      0.030193
      0.037480
      0.035398
      0.033837
      0.035398
      0.032795
      0.034878
      0
    
    
      7000
      0.036171
      0.033389
      0.030607
      0.036728
      0.035058
      0.030607
      0.036171
      0.028381
      0.036728
      0.033945
      ...
      0.031720
      0.036728
      0.036728
      0.033389
      0.033945
      0.030607
      0.025042
      0.032276
      0.035615
      0
    
    
      8000
      0.038825
      0.032354
      0.038328
      0.029368
      0.031857
      0.031857
      0.024888
      0.027875
      0.028870
      0.025884
      ...
      0.033350
      0.036834
      0.026381
      0.035839
      0.038328
      0.038825
      0.037332
      0.037830
      0.040319
      0
    
    
      9000
      0.042735
      0.013431
      0.037851
      0.026862
      0.035409
      0.026862
      0.037851
      0.012821
      0.031136
      0.023810
      ...
      0.037241
      0.034799
      0.034799
      0.041514
      0.044567
      0.043956
      0.043956
      0.039683
      0.042735
      0
    
    
      10000
      0.044338
      0.012960
      0.027285
      0.015007
      0.021828
      0.008868
      0.011596
      0.012278
      0.036153
      0.040928
      ...
      0.038199
      0.054570
      0.023192
      0.036835
      0.040928
      0.038881
      0.042292
      0.045020
      0.046385
      0
    
    
      11000
      0.039651
      0.012769
      0.025538
      0.019489
      0.033602
      0.014785
      0.021505
      0.017473
      0.045699
      0.038978
      ...
      0.036962
      0.043011
      0.030242
      0.037634
      0.037634
      0.039651
      0.037634
      0.037634
      0.039651
      0
    
    
      12000
      0.038585
      0.034834
      0.033762
      0.028403
      0.042337
      0.033762
      0.040193
      0.036442
      0.041265
      0.040729
      ...
      0.036977
      0.032154
      0.034298
      0.038049
      0.039657
      0.015005
      0.011790
      0.009110
      0.009646
      1
    
    
      13000
      0.037671
      0.034817
      0.038813
      0.033105
      0.041096
      0.037100
      0.033676
      0.033105
      0.043950
      0.049087
      ...
      0.038242
      0.026256
      0.033105
      0.031393
      0.027397
      0.021119
      0.022831
      0.014840
      0.019406
      1
    
    
      14000
      0.034530
      0.024171
      0.036602
      0.035912
      0.031077
      0.022099
      0.030387
      0.025552
      0.047652
      0.042818
      ...
      0.031768
      0.043508
      0.035221
      0.035221
      0.033149
      0.026934
      0.026934
      0.031077
      0.032459
      1
    
    
      15000
      0.038213
      0.028525
      0.031216
      0.036598
      0.039290
      0.032293
      0.024220
      0.024758
      0.033907
      0.040366
      ...
      0.038751
      0.028525
      0.032831
      0.037675
      0.032831
      0.032293
      0.031755
      0.023143
      0.034446
      1
    
    
      16000
      0.035503
      0.027972
      0.036579
      0.035503
      0.037117
      0.032813
      0.026358
      0.031737
      0.041958
      0.042496
      ...
      0.032275
      0.031737
      0.034965
      0.034427
      0.036041
      0.027972
      0.025282
      0.027972
      0.028510
      1
    
    
      17000
      0.036496
      0.019465
      0.036496
      0.030414
      0.034063
      0.031022
      0.028589
      0.020681
      0.041971
      0.034063
      ...
      0.031022
      0.036496
      0.029197
      0.029197
      0.038929
      0.036496
      0.035280
      0.030414
      0.037105
      1
    
    
      18000
      0.036010
      0.032603
      0.036983
      0.033090
      0.036010
      0.040389
      0.023358
      0.031144
      0.041363
      0.038929
      ...
      0.036010
      0.027251
      0.035036
      0.033577
      0.019951
      0.033090
      0.035036
      0.036983
      0.035036
      1
    
    
      19000
      0.038373
      0.021764
      0.022910
      0.038373
      0.038373
      0.022910
      0.020046
      0.012027
      0.044674
      0.043528
      ...
      0.034937
      0.034364
      0.037801
      0.037801
      0.038946
      0.033792
      0.034937
      0.035510
      0.036082
      1
    
    
      20000
      0.036822
      0.025194
      0.027778
      0.032300
      0.037468
      0.030362
      0.025194
      0.028424
      0.045866
      0.046512
      ...
      0.037468
      0.050388
      0.030362
      0.034238
      0.021318
      0.027778
      0.027132
      0.026486
      0.025194
      1
    
    
      21000
      0.037160
      0.023849
      0.032169
      0.022185
      0.034942
      0.033278
      0.030505
      0.021631
      0.039933
      0.032723
      ...
      0.032723
      0.035496
      0.029950
      0.034387
      0.036051
      0.037715
      0.036051
      0.033278
      0.036606
      0
    
    
      22000
      0.039510
      0.015668
      0.015668
      0.030654
      0.035422
      0.016349
      0.025886
      0.017711
      0.052452
      0.049728
      ...
      0.022480
      0.040191
      0.036785
      0.036785
      0.040872
      0.036785
      0.037466
      0.037466
      0.040191
      0
    
    
      23000
      0.037413
      0.030465
      0.032603
      0.034741
      0.036344
      0.030999
      0.032603
      0.028327
      0.041689
      0.042758
      ...
      0.032068
      0.029396
      0.035810
      0.033137
      0.027793
      0.019241
      0.024051
      0.025120
      0.024586
      1
    
    
      24000
      0.034714
      0.027996
      0.031355
      0.030235
      0.031915
      0.031355
      0.030795
      0.027436
      0.033035
      0.045353
      ...
      0.032475
      0.040873
      0.031915
      0.032475
      0.032475
      0.029115
      0.026316
      0.025196
      0.031915
      1
    
    
      25000
      0.035678
      0.027013
      0.032620
      0.035678
      0.035168
      0.031091
      0.025994
      0.016820
      0.039755
      0.034149
      ...
      0.033129
      0.041284
      0.036188
      0.032110
      0.038736
      0.033639
      0.032620
      0.035678
      0.034149
      0
    
    
      26000
      0.042234
      0.014986
      0.025886
      0.027929
      0.029292
      0.015668
      0.026567
      0.019755
      0.029973
      0.042234
      ...
      0.030654
      0.051090
      0.018392
      0.036785
      0.036104
      0.038147
      0.044278
      0.042234
      0.039510
      0
    
    
      27000
      0.039192
      0.022045
      0.021433
      0.023270
      0.037967
      0.020208
      0.028169
      0.020821
      0.046540
      0.036742
      ...
      0.033068
      0.042866
      0.025107
      0.036130
      0.034905
      0.039192
      0.042866
      0.044091
      0.044703
      0
    
  

28 rows × 31 columns



In [34]:

    
# perform 5 cross validation
clf = LogisticRegression()
x = df_fifa_normalized.iloc[:,:-1]
y = df_fifa_normalized.iloc[:,-1]
scores = cross_val_score(clf, x, y, cv=5)
print ('Logistic Regression Accuracy: {}'.format(np.mean(scores)))









    



Logistic Regression Accuracy: 0.8247060870911659

Tune the features by lasso



In [35]:

    
#Perform lasso to get rid of the attribute that unnecessary influence the decision of position
clf = Lasso(alpha=0.00001)
clf.fit(x,y)
Feature_Coef_list = list(sorted(zip(recol_needed, abs(clf.coef_)),key=lambda x: -x[1]))
Feature_Coef_table = pd.DataFrame(np.array(Feature_Coef_list).reshape(-1,2), columns = ['Attributes', 'Coefficient'])
print(Feature_Coef_table)









    



            Attributes      Coefficient
0              Marking    12.8268120885
1            Finishing    9.78612494569
2               Vision    6.79233236356
3       Sliding tackle    5.24005141908
4        Interceptions     4.6865797977
5     Heading accuracy    3.95074704299
6             Strength    1.94418902492
7              Jumping    1.28531447256
8          Positioning   0.790548278525
9              Volleys   0.396498035159
10     Standing tackle  0.0286348328445
11             Overall              0.0
12          Shot power              0.0
13           Dribbling              0.0
14          Long shots              0.0
15           Penalties              0.0
16        Acceleration              0.0
17             Agility              0.0
18        Sprint speed              0.0
19               Curve              0.0
20  Free kick accuracy              0.0
21       Short passing              0.0
22        Long passing              0.0
23             Stamina              0.0
24             Balance              0.0
25        Ball control              0.0
26           Composure              0.0
27            Crossing              0.0
28           Reactions              0.0
29          Aggression              0.0

now we try to enumerate the features to get the highest performance



In [36]:

    
max_score = 0
n_features = 0

for i in range(1,len(Feature_Coef_table['Attributes'])):
    clf_lasso = LogisticRegression()
    lasso_cols = Feature_Coef_table[:i]['Attributes'].tolist()
    x_lasso = df_fifa_normalized.iloc[:,:-1][lasso_cols]
    scores_lasso = cross_val_score(clf_lasso, x_lasso,y , cv=5)
    if np.mean(scores_lasso) > max_score:
        max_score = np.mean(scores_lasso)
        n_features = i

print ('Logistic Regression Accuracy (' + str(n_features) +' features):' + str(max_score))









    



Logistic Regression Accuracy (5 features):0.836595975151

As we can see here. we are improve the accuracy slightly

And it is higher than baseline 0.5



In [17]:

    
imp_features = Feature_Coef_table[:n_features]['Attributes'].tolist()
print('The important features to determine the 1/0 is')
print(imp_features)









    



The important features to determine the 1/0 is
['Marking', 'Finishing', 'Vision', 'Overall', 'Interceptions', 'Crossing', 'Sliding tackle']

2. Random Forest

* predict all the position *



In [18]:

    
# Set the baseline of the prediction
baseline = 1/len(positions)
print('The baseline is', baseline)









    



The baseline is 0.07142857142857142



In [19]:

    
df_fifa_all_pos = df_fifa.copy()
mapping_all = {'ST': 0, 'RW': 1, 'LW': 2, 'RM': 3, 'CM': 4, 'LM': 5, 'CAM': 6, 'CF': 7, 'CDM': 8, 'CB': 9, 'LB': 10, 'RB': 11, 'RWB': 12, 'LWB': 13}

df_fifa_all_pos = df_fifa_all_pos.replace({'Preferred Positions': mapping_all})
df_fifa_all_pos.iloc[::1000,]









    Out[19]:







  
    
      
      Overall
      Finishing
      Shot power
      Positioning
      Dribbling
      Long shots
      Penalties
      Volleys
      Acceleration
      Agility
      ...
      Composure
      Jumping
      Crossing
      Reactions
      Aggression
      Interceptions
      Marking
      Sliding tackle
      Standing tackle
      Preferred Positions
    
  
  
    
      0
      94
      94
      94
      95
      91
      92
      85
      88
      89
      89
      ...
      95
      95
      85
      96
      63
      29
      22
      23
      31
      0
    
    
      1000
      70
      68
      66
      75
      67
      63
      67
      63
      63
      68
      ...
      67
      72
      62
      68
      70
      56
      21
      25
      31
      0
    
    
      2000
      64
      53
      50
      61
      67
      42
      51
      48
      86
      81
      ...
      44
      60
      59
      58
      32
      17
      23
      26
      24
      0
    
    
      3000
      56
      61
      53
      49
      48
      49
      53
      47
      56
      58
      ...
      49
      78
      31
      53
      42
      28
      22
      19
      24
      0
    
    
      4000
      62
      57
      59
      60
      61
      52
      57
      45
      79
      70
      ...
      49
      54
      59
      65
      41
      26
      22
      20
      19
      1
    
    
      5000
      61
      46
      70
      72
      60
      54
      63
      67
      76
      74
      ...
      58
      51
      47
      58
      34
      13
      17
      17
      17
      2
    
    
      6000
      71
      54
      68
      60
      66
      64
      69
      49
      45
      68
      ...
      77
      73
      58
      72
      68
      65
      68
      63
      67
      8
    
    
      7000
      65
      60
      55
      66
      63
      55
      65
      51
      66
      61
      ...
      57
      66
      66
      60
      61
      55
      45
      58
      64
      8
    
    
      8000
      78
      65
      77
      59
      64
      64
      50
      56
      58
      52
      ...
      67
      74
      53
      72
      77
      78
      75
      76
      81
      9
    
    
      9000
      70
      22
      62
      44
      58
      44
      62
      21
      51
      39
      ...
      61
      57
      57
      68
      73
      72
      72
      65
      70
      9
    
    
      10000
      65
      19
      40
      22
      32
      13
      17
      18
      53
      60
      ...
      56
      80
      34
      54
      60
      57
      62
      66
      68
      9
    
    
      11000
      59
      19
      38
      29
      50
      22
      32
      26
      68
      58
      ...
      55
      64
      45
      56
      56
      59
      56
      56
      59
      9
    
    
      12000
      72
      65
      63
      53
      79
      63
      75
      68
      77
      76
      ...
      69
      60
      64
      71
      74
      28
      22
      17
      18
      3
    
    
      13000
      66
      61
      68
      58
      72
      65
      59
      58
      77
      86
      ...
      67
      46
      58
      55
      48
      37
      40
      26
      34
      3
    
    
      14000
      50
      35
      53
      52
      45
      32
      44
      37
      69
      62
      ...
      46
      63
      51
      51
      48
      39
      39
      45
      47
      3
    
    
      15000
      71
      53
      58
      68
      73
      60
      45
      46
      63
      75
      ...
      72
      53
      61
      70
      61
      60
      59
      43
      64
      4
    
    
      16000
      66
      52
      68
      66
      69
      61
      49
      59
      78
      79
      ...
      60
      59
      65
      64
      67
      52
      47
      52
      53
      4
    
    
      17000
      60
      32
      60
      50
      56
      51
      47
      34
      69
      56
      ...
      51
      60
      48
      48
      64
      60
      58
      50
      61
      4
    
    
      18000
      74
      67
      76
      68
      74
      83
      48
      64
      85
      80
      ...
      74
      56
      72
      69
      41
      68
      72
      76
      72
      5
    
    
      19000
      67
      38
      40
      67
      67
      40
      35
      21
      78
      76
      ...
      61
      60
      66
      66
      68
      59
      61
      62
      63
      5
    
    
      20000
      57
      39
      43
      50
      58
      47
      39
      44
      71
      72
      ...
      58
      78
      47
      53
      33
      43
      42
      41
      39
      5
    
    
      21000
      67
      43
      58
      40
      63
      60
      55
      39
      72
      59
      ...
      59
      64
      54
      62
      65
      68
      65
      60
      66
      10
    
    
      22000
      58
      23
      23
      45
      52
      24
      38
      26
      77
      73
      ...
      33
      59
      54
      54
      60
      54
      55
      55
      59
      10
    
    
      23000
      70
      57
      61
      65
      68
      58
      61
      53
      78
      80
      ...
      60
      55
      67
      62
      52
      36
      45
      47
      46
      6
    
    
      24000
      62
      50
      56
      54
      57
      56
      55
      49
      59
      81
      ...
      58
      73
      57
      58
      58
      52
      47
      45
      57
      6
    
    
      25000
      70
      53
      64
      70
      69
      61
      51
      33
      78
      67
      ...
      65
      81
      71
      63
      76
      66
      64
      70
      67
      11
    
    
      26000
      62
      22
      38
      41
      43
      23
      39
      29
      44
      62
      ...
      45
      75
      27
      54
      53
      56
      65
      62
      58
      11
    
    
      27000
      64
      36
      35
      38
      62
      33
      46
      34
      76
      60
      ...
      54
      70
      41
      59
      57
      64
      70
      72
      73
      12
    
  

28 rows × 31 columns



In [20]:

    
# perform 5 cross validation
clf = LogisticRegression()
x = df_fifa_all_pos.iloc[:,:-1]
y = df_fifa_all_pos.iloc[:,-1]
log_scores = cross_val_score(clf, x, y, cv=3)
print ('Logistic Regression Accuracy: {}'.format(np.mean(log_scores)))









    



Logistic Regression Accuracy: 0.45080732796136597



In [21]:

    
clf = RandomForestClassifier(random_state=0)
x = df_fifa_all_pos.iloc[:,:-1]
y = df_fifa_all_pos.iloc[:,-1]
rf_scores = cross_val_score(clf, x, y, cv=3)
print ('Random Forest Accuracy: {}'.format(np.mean(rf_scores)))









    



Random Forest Accuracy: 0.32505937892358683

Tune the features by ridge



In [22]:

    
# Perform ridge to get the importance of the feature when determining the position.
clf = Ridge(alpha=0.001)
clf.fit(x,y)
Feature_Coef_list = list(sorted(zip(recol_needed, abs(clf.coef_)),key=lambda x: -x[1]))
Feature_Coef_table = pd.DataFrame(np.array(Feature_Coef_list).reshape(-1,2), columns = ['Attributes', 'Coefficient'])
print(Feature_Coef_table)









    



            Attributes        Coefficient
0              Overall    0.0717080168083
1            Finishing    0.0522304336032
2             Crossing    0.0433288377307
3              Marking    0.0314207616489
4         Ball control    0.0304023274928
5       Sliding tackle    0.0256481212323
6        Short passing    0.0219442536492
7          Positioning    0.0209245070663
8        Interceptions    0.0163144373453
9               Vision    0.0162954778204
10           Penalties    0.0136074977623
11     Standing tackle    0.0131716395467
12          Shot power    0.0112859824342
13               Curve   0.00912983517573
14        Long passing   0.00851227891875
15    Heading accuracy   0.00806576156269
16             Volleys   0.00764793997365
17  Free kick accuracy    0.0076164430895
18          Aggression   0.00754916691825
19           Composure    0.0073492387795
20          Long shots    0.0063430751331
21            Strength   0.00427936412214
22             Stamina   0.00339006056184
23             Balance   0.00321063647673
24             Jumping   0.00225932031214
25        Acceleration    0.0017792794071
26           Dribbling   0.00112446965917
27           Reactions  0.000953019137569
28        Sprint speed  0.000765540510496
29             Agility  0.000505710571809

now we try to enumerate the features to get the highest performance



In [23]:

    
max_score = 0
n_features = 0

for i in range(1,len(Feature_Coef_table['Attributes'])):
    clf_ridge = RandomForestClassifier(random_state=0)
    ridge_cols = Feature_Coef_table[:i]['Attributes'].tolist()
    x_ridge = df_fifa_normalized.iloc[:,:-1][ridge_cols]
    scores_ridge = cross_val_score(clf_ridge, x_ridge,y , cv=3)
    if np.mean(scores_ridge) > max_score:
        max_score = np.mean(scores_ridge)
        n_features = i

print ('Random Forest Accuracy (' + str(n_features) +' features):' + str(max_score))









    



Random Forest Accuracy (22 features):0.395765861155



In [24]:

    
imp_features = Feature_Coef_table[:n_features]['Attributes'].tolist()
print('The important features to determine the positon is')
print(imp_features)









    



The important features to determine the positon is
['Overall', 'Finishing', 'Crossing', 'Marking', 'Ball control', 'Sliding tackle', 'Short passing', 'Positioning', 'Interceptions', 'Vision', 'Penalties', 'Standing tackle', 'Shot power', 'Curve', 'Long passing', 'Heading accuracy', 'Volleys', 'Free kick accuracy', 'Aggression', 'Composure', 'Long shots', 'Strength']

As we can see here. we are improve the accuracy slightly

And 0.395765861155 is higher than baseline 0.07142857142857142

3. Linear Regression

* predict the overall of the player. *

define a new cross validation



In [25]:

    
def cross_Validation_reg(reg, X, y, k = 3):
    
    tMSE = list()

    for train_index, test_index in KFold(n_splits=k, random_state=None, shuffle=False).split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        regm = reg.fit(X_train, y_train)
        tMSE.append(np.mean((y_test - regm.predict(X_test)) ** 2))
    return np.mean(tMSE)



In [26]:

    
## set y overall
overall = np.array(df_fifa.iloc[:,0:1])[:,0]
Xb = csr_matrix(df_fifa.iloc[:, 1:-1])
Xb.toarray()









    Out[26]:





array([[94, 94, 95, ..., 22, 23, 31],
       [94, 87, 92, ..., 30, 38, 45],
       [91, 88, 91, ..., 25, 19, 42],
       ..., 
       [35, 27, 51, ..., 55, 57, 55],
       [38, 53, 39, ..., 29, 36, 30],
       [34, 44, 44, ..., 49, 43, 48]], dtype=int64)



In [27]:

    
class baseline:
    
    def fit(self, X, y):
        return self
    
    def predict(self, X):
        n = X.shape[0]
        res = np.zeros(n)
        for i in range(n):
            res[i] = np.mean(X[i,:])
        return res

# set the baseline class for certain player
bl = baseline()



In [28]:

    
# test baseline for all
cross_Validation_reg(bl, Xb, overall, 5)









    Out[28]:





84.765765665367482

Perform linear model



In [29]:

    
overall = np.array(df_fifa.iloc[:,0:1])[:,0]
X = csr_matrix(df_fifa_all_pos.iloc[:, :])
lr = LinearRegression()

## ignore the positions
accuracy = cross_Validation_reg(lr, Xb,overall, 5)
print('The linear model Accuracy(ignore the positions):' + str(accuracy))

lr = LinearRegression()
## fatorize the positions
accuracy_f = cross_Validation_reg(lr, X, overall, 5)
print('The linear model Accuracy(fatorize the positions):' + str(accuracy_f))









    



The linear model Accuracy(ignore the positions):24.1899472391
The linear model Accuracy(fatorize the positions):21.6724222569



In [30]:

    
lrm = lr.fit(X, overall)
print('The coef are ' + str(lrm.coef_))
print('The intercept is ' + str(lrm.intercept_))









    



The coef are [ 0.50296368 -0.0088104  -0.00452567  0.01011129  0.01096997  0.05972182
 -0.08519162  0.02961537 -0.05606899 -0.04137218 -0.04830314  0.03610987
 -0.01101434 -0.05770672 -0.07897779  0.01758256 -0.05891234 -0.24378331
  0.02476237 -0.27168061  0.02807166  0.07749302 -0.00907431  0.07296686
  0.10880582  0.03459514  0.06388949  0.00142567 -0.04639001 -0.01173554
 -0.25987998]
The intercept is 66.9109023522

Perform polynomial model



In [31]:

    
model = make_pipeline(PolynomialFeatures(2), Ridge(copy_X = False))
accuracy_p = cross_Validation_reg(model, X.toarray(), overall, 5)
print('The polyomial model accuracy (factorize the positions):' + str(accuracy))









    



The polyomial model accuracy (factorize the positions):24.1899472391



In [32]:

    
nX = X.toarray()
modelp = model.fit(nX , overall)
result = modelp.predict(nX)
print('To predict all the player overall rating by our model')
print(result)









    



To predict all the player overall rating by our model
[ 94.00029641  92.00053072  91.00084033 ...,  53.00020395  51.00029448
  47.00097197]

we get a very good accuracy in polynomial model

	Unnamed: 0	Name	Age	Photo	Nationality	Flag	Overall	Potential	Club	Club Logo	...	RB	RCB	RCM	RDM	RF	RM	RS	RW	RWB	ST
0	0	Cristiano Ronaldo	32	https://cdn.sofifa.org/48/18/players/20801.png	Portugal	https://cdn.sofifa.org/flags/38.png	94	94	Real Madrid CF	https://cdn.sofifa.org/24/18/teams/243.png	...	61.0	53.0	82.0	62.0	91.0	89.0	92.0	91.0	66.0	92.0
1	1	L. Messi	30	https://cdn.sofifa.org/48/18/players/158023.png	Argentina	https://cdn.sofifa.org/flags/52.png	93	93	FC Barcelona	https://cdn.sofifa.org/24/18/teams/241.png	...	57.0	45.0	84.0	59.0	92.0	90.0	88.0	91.0	62.0	88.0
2	2	Neymar	25	https://cdn.sofifa.org/48/18/players/190871.png	Brazil	https://cdn.sofifa.org/flags/54.png	92	94	Paris Saint-Germain	https://cdn.sofifa.org/24/18/teams/73.png	...	59.0	46.0	79.0	59.0	88.0	87.0	84.0	89.0	64.0	84.0
3	3	L. Suárez	30	https://cdn.sofifa.org/48/18/players/176580.png	Uruguay	https://cdn.sofifa.org/flags/60.png	92	92	FC Barcelona	https://cdn.sofifa.org/24/18/teams/241.png	...	64.0	58.0	80.0	65.0	88.0	85.0	88.0	87.0	68.0	88.0
4	4	M. Neuer	31	https://cdn.sofifa.org/48/18/players/167495.png	Germany	https://cdn.sofifa.org/flags/21.png	92	92	FC Bayern Munich	https://cdn.sofifa.org/24/18/teams/21.png	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
5	5	R. Lewandowski	28	https://cdn.sofifa.org/48/18/players/188545.png	Poland	https://cdn.sofifa.org/flags/37.png	91	91	FC Bayern Munich	https://cdn.sofifa.org/24/18/teams/21.png	...	58.0	57.0	78.0	62.0	87.0	82.0	88.0	84.0	61.0	88.0
6	6	De Gea	26	https://cdn.sofifa.org/48/18/players/193080.png	Spain	https://cdn.sofifa.org/flags/45.png	90	92	Manchester United	https://cdn.sofifa.org/24/18/teams/11.png	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
7	7	E. Hazard	26	https://cdn.sofifa.org/48/18/players/183277.png	Belgium	https://cdn.sofifa.org/flags/7.png	90	91	Chelsea	https://cdn.sofifa.org/24/18/teams/5.png	...	59.0	47.0	81.0	61.0	87.0	87.0	82.0	88.0	64.0	82.0
8	8	T. Kroos	27	https://cdn.sofifa.org/48/18/players/182521.png	Germany	https://cdn.sofifa.org/flags/21.png	90	90	Real Madrid CF	https://cdn.sofifa.org/24/18/teams/243.png	...	76.0	72.0	87.0	82.0	81.0	81.0	77.0	80.0	78.0	77.0
9	9	G. Higuaín	29	https://cdn.sofifa.org/48/18/players/167664.png	Argentina	https://cdn.sofifa.org/flags/52.png	90	90	Juventus	https://cdn.sofifa.org/24/18/teams/45.png	...	51.0	46.0	71.0	52.0	84.0	79.0	87.0	82.0	55.0	87.0

	Overall	Finishing	Shot power	Positioning	Dribbling	Long shots	Penalties	Volleys	Acceleration	Agility	...	Composure	Jumping	Crossing	Reactions	Aggression	Interceptions	Marking	Sliding tackle	Standing tackle	Preferred Positions
0	94	94	94	95	91	92	85	88	89	89	...	95	95	85	96	63	29	22	23	31	ST LW
1	93	95	85	93	97	88	74	85	92	90	...	96	68	77	95	48	22	13	26	28	RW
2	92	89	80	90	96	77	81	83	94	96	...	92	61	75	88	56	36	21	33	24	LW
3	92	94	87	92	86	86	85	88	88	86	...	83	69	77	93	78	41	30	38	45	ST
4	92	13	25	12	30	16	47	11	58	52	...	70	78	15	85	29	30	10	11	10	GK
5	91	91	88	91	85	83	81	87	79	78	...	87	84	62	91	80	39	25	19	42	ST
6	90	13	31	12	18	12	40	13	57	60	...	64	67	17	88	38	30	13	13	21	GK
7	90	83	79	85	93	82	86	79	93	93	...	87	59	80	85	54	41	25	22	27	LW
8	90	76	87	79	79	90	73	82	60	71	...	85	32	85	86	60	85	63	69	82	CDM CM
9	90	91	88	92	84	82	70	88	78	75	...	86	79	68	88	50	20	12	18	22	ST

	Overall	Finishing	Shot power	Positioning	Dribbling	Long shots	Penalties	Volleys	Acceleration	Agility	...	Composure	Jumping	Crossing	Reactions	Aggression	Interceptions	Marking	Sliding tackle	Standing tackle	Preferred Positions
0	94	94	94	95	91	92	85	88	89	89	...	95	95	85	96	63	29	22	23	31	ST LW
1	93	95	85	93	97	88	74	85	92	90	...	96	68	77	95	48	22	13	26	28	RW
2	92	89	80	90	96	77	81	83	94	96	...	92	61	75	88	56	36	21	33	24	LW
3	92	94	87	92	86	86	85	88	88	86	...	83	69	77	93	78	41	30	38	45	ST
5	91	91	88	91	85	83	81	87	79	78	...	87	84	62	91	80	39	25	19	42	ST
7	90	83	79	85	93	82	86	79	93	93	...	87	59	80	85	54	41	25	22	27	LW
8	90	76	87	79	79	90	73	82	60	71	...	85	32	85	86	60	85	63	69	82	CDM CM
9	90	91	88	92	84	82	70	88	78	75	...	86	79	68	88	50	20	12	18	22	ST
10	90	60	79	52	61	55	68	66	75	79	...	80	93	66	85	84	88	86	91	89	CB
11	89	83	85	84	85	86	77	82	76	80	...	84	65	90	88	68	56	30	40	51	RM CM CAM

	Overall	Finishing	Shot power	Positioning	Dribbling	Long shots	Penalties	Volleys	Acceleration	Agility	...	Composure	Jumping	Crossing	Reactions	Aggression	Interceptions	Marking	Sliding tackle	Standing tackle	Preferred Positions
0	94	94	94	95	91	92	85	88	89	89	...	95	95	85	96	63	29	22	23	31	ST
1000	70	68	66	75	67	63	67	63	63	68	...	67	72	62	68	70	56	21	25	31	ST
2000	64	53	50	61	67	42	51	48	86	81	...	44	60	59	58	32	17	23	26	24	ST
3000	56	61	53	49	48	49	53	47	56	58	...	49	78	31	53	42	28	22	19	24	ST
4000	62	57	59	60	61	52	57	45	79	70	...	49	54	59	65	41	26	22	20	19	RW
5000	61	46	70	72	60	54	63	67	76	74	...	58	51	47	58	34	13	17	17	17	LW
6000	71	57-3	73-5	63-3	67-1	68-4	69	49	45	68	...	77	73	61-3	72	68	69-4	68	66-3	69-2	CDM
7000	65	60	55	66	63	55	65	51	66	61	...	57	66	66	60	61	55	45	58	64	CDM
8000	78	65	77	59	64	64	50	56	58	52	...	67	74	53	72	77	78	75	76	81	CB
9000	70	22	62	44	58	44	62	21	51	39	...	61	57	57	68	73	72	72	65	70	CB
10000	65	19	40	22	32	13	17	18	53	60	...	56	80	34	54	60	57	62	66	68	CB
11000	59	19	38	29	50	22	32	26	68	58	...	55	64	45	56	56	59	56	56	59	CB
12000	72	65	63	53	79	63	75	68	77	76	...	69	60	64	71	74	28	22	17	18	RM
13000	66	61	68	58	72	65	59	58	77	86	...	67	46	58	55	48	37	40	26	34	RM
14000	50	35	53	52	45	32	44	37	69	62	...	46	63	51	51	48	39	39	45	47	RM
15000	71	53	56+2	68	73	55+5	45	46	63	75	...	72	53	61	70	61	60	59	43	64	CM
16000	66	52	68	66	69	61	49	59	78	79	...	60	59	65	64	67	52	47	52	53	CM
17000	60	32	60	50	56	51	47	34	69	56	...	51	60	48	48	64	60	58	50	61	CM
18000	74	67	76	68	74	83	48	64	85	80	...	74	56	72	69	41	68	72	76	72	LM
19000	67	38	40	67	67	40	35	21	78	76	...	61	60	66	66	68	59	61	62	63	LM
20000	57	39	43	50	58	47	39	44	71	72	...	58	78	47	53	33	43	42	41	39	LM
21000	67	43	58	40	63	60	55	39	72	59	...	59	64	54	62	65	68	65	60	66	LB
22000	58	23	23	45	52	24	38	26	77	73	...	33	59	54	54	60	54	55	55	59	LB
23000	70	57	61	65	68	58	61	53	78	80	...	60	55	67	62	52	36	45	47	46	CAM
24000	62	50	56	54	57	56	55	49	59	81	...	58	73	57	58	58	52	47	45	57	CAM
25000	70	53	64	70	69	61	51	33	78	67	...	65	81	71	63	76	66	64	70	67	RB
26000	62	22	38	41	43	23	39	29	44	62	...	45	75	27	54	53	56	65	62	58	RB
27000	64	36	35	38	62	33	46	34	76	60	...	54	70	41	59	57	64	70	72	73	RWB

	Overall	Finishing	Shot power	Positioning	Dribbling	Long shots	Penalties	Volleys	Acceleration	Agility	...	Composure	Jumping	Crossing	Reactions	Aggression	Interceptions	Marking	Sliding tackle	Standing tackle	Preferred Positions
0	0.039847	0.039847	0.039847	0.040271	0.038576	0.039000	0.036032	0.037304	0.037728	0.037728	...	0.040271	0.040271	0.036032	0.040695	0.026706	0.012293	0.009326	0.009750	0.013141	1
1000	0.037797	0.036717	0.035637	0.040497	0.036177	0.034017	0.036177	0.034017	0.034017	0.036717	...	0.036177	0.038877	0.033477	0.036717	0.037797	0.030238	0.011339	0.013499	0.016739	1
2000	0.040842	0.033823	0.031908	0.038928	0.042757	0.026803	0.032546	0.030632	0.054882	0.051691	...	0.028079	0.038290	0.037652	0.037013	0.020421	0.010849	0.014678	0.016592	0.015316	1
3000	0.039773	0.043324	0.037642	0.034801	0.034091	0.034801	0.037642	0.033381	0.039773	0.041193	...	0.034801	0.055398	0.022017	0.037642	0.029830	0.019886	0.015625	0.013494	0.017045	1
4000	0.039241	0.036076	0.037342	0.037975	0.038608	0.032911	0.036076	0.028481	0.050000	0.044304	...	0.031013	0.034177	0.037342	0.041139	0.025949	0.016456	0.013924	0.012658	0.012025	1
5000	0.038293	0.028876	0.043942	0.045198	0.037665	0.033898	0.039548	0.042059	0.047709	0.046453	...	0.036409	0.032015	0.029504	0.036409	0.021343	0.008161	0.010672	0.010672	0.010672	1
6000	0.036960	0.028110	0.035398	0.031234	0.034357	0.033316	0.035919	0.025508	0.023425	0.035398	...	0.040083	0.038001	0.030193	0.037480	0.035398	0.033837	0.035398	0.032795	0.034878	0
7000	0.036171	0.033389	0.030607	0.036728	0.035058	0.030607	0.036171	0.028381	0.036728	0.033945	...	0.031720	0.036728	0.036728	0.033389	0.033945	0.030607	0.025042	0.032276	0.035615	0
8000	0.038825	0.032354	0.038328	0.029368	0.031857	0.031857	0.024888	0.027875	0.028870	0.025884	...	0.033350	0.036834	0.026381	0.035839	0.038328	0.038825	0.037332	0.037830	0.040319	0
9000	0.042735	0.013431	0.037851	0.026862	0.035409	0.026862	0.037851	0.012821	0.031136	0.023810	...	0.037241	0.034799	0.034799	0.041514	0.044567	0.043956	0.043956	0.039683	0.042735	0
10000	0.044338	0.012960	0.027285	0.015007	0.021828	0.008868	0.011596	0.012278	0.036153	0.040928	...	0.038199	0.054570	0.023192	0.036835	0.040928	0.038881	0.042292	0.045020	0.046385	0
11000	0.039651	0.012769	0.025538	0.019489	0.033602	0.014785	0.021505	0.017473	0.045699	0.038978	...	0.036962	0.043011	0.030242	0.037634	0.037634	0.039651	0.037634	0.037634	0.039651	0
12000	0.038585	0.034834	0.033762	0.028403	0.042337	0.033762	0.040193	0.036442	0.041265	0.040729	...	0.036977	0.032154	0.034298	0.038049	0.039657	0.015005	0.011790	0.009110	0.009646	1
13000	0.037671	0.034817	0.038813	0.033105	0.041096	0.037100	0.033676	0.033105	0.043950	0.049087	...	0.038242	0.026256	0.033105	0.031393	0.027397	0.021119	0.022831	0.014840	0.019406	1
14000	0.034530	0.024171	0.036602	0.035912	0.031077	0.022099	0.030387	0.025552	0.047652	0.042818	...	0.031768	0.043508	0.035221	0.035221	0.033149	0.026934	0.026934	0.031077	0.032459	1
15000	0.038213	0.028525	0.031216	0.036598	0.039290	0.032293	0.024220	0.024758	0.033907	0.040366	...	0.038751	0.028525	0.032831	0.037675	0.032831	0.032293	0.031755	0.023143	0.034446	1
16000	0.035503	0.027972	0.036579	0.035503	0.037117	0.032813	0.026358	0.031737	0.041958	0.042496	...	0.032275	0.031737	0.034965	0.034427	0.036041	0.027972	0.025282	0.027972	0.028510	1
17000	0.036496	0.019465	0.036496	0.030414	0.034063	0.031022	0.028589	0.020681	0.041971	0.034063	...	0.031022	0.036496	0.029197	0.029197	0.038929	0.036496	0.035280	0.030414	0.037105	1
18000	0.036010	0.032603	0.036983	0.033090	0.036010	0.040389	0.023358	0.031144	0.041363	0.038929	...	0.036010	0.027251	0.035036	0.033577	0.019951	0.033090	0.035036	0.036983	0.035036	1
19000	0.038373	0.021764	0.022910	0.038373	0.038373	0.022910	0.020046	0.012027	0.044674	0.043528	...	0.034937	0.034364	0.037801	0.037801	0.038946	0.033792	0.034937	0.035510	0.036082	1
20000	0.036822	0.025194	0.027778	0.032300	0.037468	0.030362	0.025194	0.028424	0.045866	0.046512	...	0.037468	0.050388	0.030362	0.034238	0.021318	0.027778	0.027132	0.026486	0.025194	1
21000	0.037160	0.023849	0.032169	0.022185	0.034942	0.033278	0.030505	0.021631	0.039933	0.032723	...	0.032723	0.035496	0.029950	0.034387	0.036051	0.037715	0.036051	0.033278	0.036606	0
22000	0.039510	0.015668	0.015668	0.030654	0.035422	0.016349	0.025886	0.017711	0.052452	0.049728	...	0.022480	0.040191	0.036785	0.036785	0.040872	0.036785	0.037466	0.037466	0.040191	0
23000	0.037413	0.030465	0.032603	0.034741	0.036344	0.030999	0.032603	0.028327	0.041689	0.042758	...	0.032068	0.029396	0.035810	0.033137	0.027793	0.019241	0.024051	0.025120	0.024586	1
24000	0.034714	0.027996	0.031355	0.030235	0.031915	0.031355	0.030795	0.027436	0.033035	0.045353	...	0.032475	0.040873	0.031915	0.032475	0.032475	0.029115	0.026316	0.025196	0.031915	1
25000	0.035678	0.027013	0.032620	0.035678	0.035168	0.031091	0.025994	0.016820	0.039755	0.034149	...	0.033129	0.041284	0.036188	0.032110	0.038736	0.033639	0.032620	0.035678	0.034149	0
26000	0.042234	0.014986	0.025886	0.027929	0.029292	0.015668	0.026567	0.019755	0.029973	0.042234	...	0.030654	0.051090	0.018392	0.036785	0.036104	0.038147	0.044278	0.042234	0.039510	0
27000	0.039192	0.022045	0.021433	0.023270	0.037967	0.020208	0.028169	0.020821	0.046540	0.036742	...	0.033068	0.042866	0.025107	0.036130	0.034905	0.039192	0.042866	0.044091	0.044703	0