In [1]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show

from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing as pp
from sklearn.model_selection import KFold , cross_val_score
from sklearn.metrics import make_scorer, roc_curve, roc_auc_score

import sklearn.preprocessing as preprocessing

%matplotlib inline
sns.set_context('notebook')
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', 500) # to see all columns

In [2]:
data = pd.read_csv('CrowdstormingDataJuly1st.csv')
data_total = data.copy()
print('Number of dayads', data.shape)
data.head()


Number of dayads (146028, 28)
Out[2]:
playerShort player club leagueCountry birthday height weight position games victories ties defeats goals yellowCards yellowReds redCards photoID rater1 rater2 refNum refCountry Alpha_3 meanIAT nIAT seIAT meanExp nExp seExp
0 lucas-wilchez Lucas Wilchez Real Zaragoza Spain 31.08.1983 177.0 72.0 Attacking Midfielder 1 0 0 1 0 0 0 0 95212.jpg 0.25 0.50 1 1 GRC 0.326391 712.0 0.000564 0.396000 750.0 0.002696
1 john-utaka John Utaka Montpellier HSC France 08.01.1982 179.0 82.0 Right Winger 1 0 0 1 0 1 0 0 1663.jpg 0.75 0.75 2 2 ZMB 0.203375 40.0 0.010875 -0.204082 49.0 0.061504
2 abdon-prats Abdón Prats RCD Mallorca Spain 17.12.1992 181.0 79.0 NaN 1 0 1 0 0 1 0 0 NaN NaN NaN 3 3 ESP 0.369894 1785.0 0.000229 0.588297 1897.0 0.001002
3 pablo-mari Pablo Marí RCD Mallorca Spain 31.08.1993 191.0 87.0 Center Back 1 1 0 0 0 0 0 0 NaN NaN NaN 3 3 ESP 0.369894 1785.0 0.000229 0.588297 1897.0 0.001002
4 ruben-pena Rubén Peña Real Valladolid Spain 18.07.1991 172.0 70.0 Right Midfielder 1 1 0 0 0 0 0 0 NaN NaN NaN 3 3 ESP 0.369894 1785.0 0.000229 0.588297 1897.0 0.001002

1. Observation and clean of the data


In [3]:
print('Number of diad: ', len(data))
print('Number of players: ', len(data.playerShort.unique()))
print('Number of referees: ', len(data.refNum.unique()))


Number of diad:  146028
Number of players:  2053
Number of referees:  3147

1.1. Missing values


In [4]:
complete = len(data.dropna())
all_ = len(data_total)
print('Number of row with complete data: {} ({:.3f}%)'.format(complete, (complete/all_ ) * 100 ))
print('Number of row with missing data: {} ({:.3f}%)'.format(all_-complete, (all_ -complete)/all_  * 100 ))


Number of row with complete data: 115457 (79.065%)
Number of row with missing data: 30571 (20.935%)

Where are the missing values ?


In [5]:
def find_col_nan(d):
    col = []
    for c in d.columns:
        if d[c].isnull().any():
            col = np.append(col, c)
    return col

In [6]:
missing_col = find_col_nan(data)
missing_col


Out[6]:
array(['height', 'weight', 'position', 'photoID', 'rater1', 'rater2',
       'Alpha_3', 'meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp', 'seExp'], 
      dtype='<U32')

To clean the data, we will go step by step:

- First of all we have to clean all dayads that don't have any rating, because those dayads are uneseful for our problem.
- Then we will look again which columns contains missing values and how to deal with them

In [7]:
data = data[ ~data.rater1.isnull() & ~data.rater2.isnull()]
print('Number of row with the 2 ratings {} ({:.3f}%)'.format(len(data), len(data)/len(data_total) * 100))
onlyOne = data[ ~data.rater1.isnull() ^ ~data.rater2.isnull()]
print('Number of row with only one ratings {} ({:.3f}%)'.format(len(onlyOne), len(onlyOne)/len(data_total) * 100))


Number of row with the 2 ratings 124621 (85.340%)
Number of row with only one ratings 0 (0.000%)

Check now how many incomplete dayads our data contains now


In [8]:
complete = len(data.dropna())
all_ = len(data)
print("After removing data without rating:")
print("-----------------------------------")
print('Number of row with complete data: {} ({:.3f}%)'.format(complete, (complete/all_ ) * 100 ))
print('Number of row with missing data: {} ({:.3f}%)'.format(all_-complete, (all_ -complete)/all_  * 100 ))


After removing data without rating:
-----------------------------------
Number of row with complete data: 115457 (92.647%)
Number of row with missing data: 9164 (7.353%)

Let's check again the columns with missing values


In [9]:
missing_col = find_col_nan(data)
missing_col


Out[9]:
array(['height', 'weight', 'position', 'Alpha_3', 'meanIAT', 'nIAT',
       'seIAT', 'meanExp', 'nExp', 'seExp'], 
      dtype='<U32')

We only 7.4% of incomplete dayads. Let's try to replace the nan in heights and weights with the median


In [10]:
# replace no height and weight with the mean value
median_height = np.median(data['height'].dropna())
median_weight = np.median(data['weight'].dropna())
data['height'] = data['height'].fillna(value=median_height)
data['weight'] = data['weight'].fillna(value=median_weight)

In [11]:
complete = len(data.dropna())
all_ = len(data)
print("After removing data without rating:")
print("-----------------------------------")
print('Number of row with complete data: {} ({:.3f}%)'.format(complete, (complete/all_ ) * 100 ))
print('Number of row with missing data: {} ({:.3f}%)'.format(all_-complete, (all_ -complete)/all_  * 100 ))


After removing data without rating:
-----------------------------------
Number of row with complete data: 116014 (93.093%)
Number of row with missing data: 8607 (6.907%)

In [12]:
missing_col = find_col_nan(data)
missing_col


Out[12]:
array(['position', 'Alpha_3', 'meanIAT', 'nIAT', 'seIAT', 'meanExp',
       'nExp', 'seExp'], 
      dtype='<U32')

The incomplete row only represent 7%. We will check if 2 dayads with referee of the same country, one with complete data and an other one with missing IAT and Exp info exist. In that case it is easy to complete those information


In [13]:
missing_col_test = ['meanIAT', 'nIAT', 'seIAT', 'meanExp',
       'nExp', 'seExp']
exist = False
def checkMissingTest(df):
    for col in missing_col_test:
        nbr_dayads = len(df)
        nbr_noNaN = len(df.dropna(subset=[col]))
        if nbr_dayads > nbr_noNaN &  nbr_noNaN > 0:
            exist = True
            print('There exist valid data for ', df.Alpha_3)
grouped = pd.groupby(data, by='refCountry').apply(checkMissingTest)      
print('Does it exist 2 dayads of same country, one with info on test and one with missing values in test ?: ', exist)


Does it exist 2 dayads of same country, one with info on test and one with missing values in test ?:  False

Look at how many player don't have a position


In [14]:
complete = len(data.dropna(subset=['position']))
all_ = len(data)
print("After removing data without rating:")
print("-----------------------------------")
print('Number of row with complete data: {} ({:.3f}%)'.format(complete, (complete/all_ ) * 100 ))
print('Number of row with missing data: {} ({:.3f}%)'.format(all_-complete, (all_ -complete)/all_  * 100 ))


After removing data without rating:
-----------------------------------
Number of row with complete data: 116160 (93.211%)
Number of row with missing data: 8461 (6.789%)

For those player we will try to assign them a new position called 'Joker'


In [15]:
data.position = data.position.fillna('Joker')

In [16]:
missing_col = find_col_nan(data)
missing_col


Out[16]:
array(['Alpha_3', 'meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp', 'seExp'], 
      dtype='<U32')

In [17]:
complete = len(data.dropna())
all_ = len(data)
print("After removing data without rating:")
print("-----------------------------------")
print('Number of row with complete data: {} ({:.3f}%)'.format(complete, (complete/all_ ) * 100 ))
print('Number of row with missing data: {} ({:.3f}%)'.format(all_-complete, (all_ -complete)/all_  * 100 ))


After removing data without rating:
-----------------------------------
Number of row with complete data: 124468 (99.877%)
Number of row with missing data: 153 (0.123%)

The remaining NaN represent only 0.1% and concern the columns about country and the test in those country which are hard to guess. So we deceide to drop the remaining dayad with NaN


In [18]:
data = data.dropna()
find_col_nan(data)


Out[18]:
[]

1.2. Handling different ratings (rater1 vs rater2)

  • We check if there is player with only one rating
  • We also check if one rater always give the same rate to the same player
  • Now we need to take one single rating for each row. We'll need to investigate a bit more the different rating

Is there players with only one rating?


In [19]:
(data.rater1.isnull() | data.rater2.isnull()).any()


Out[19]:
False

Are the raters consistent?


In [20]:
def areRaterConsistent(d):
    for playerID in d.playerShort.unique():
  
        player = d[d.playerShort == playerID]
        rater1 = player.rater1.unique()
        rater2 = player.rater2.unique()
        if len(rater1) >1 or len(rater2) > 1:
            return False
    return True

In [21]:
print("Are the rater consistent: ",areRaterConsistent(data))


Are the rater consistent:  True

Investigation on the raters

Values of the rating


In [22]:
data.rater1.unique()


Out[22]:
array([ 0.25,  0.75,  0.  ,  1.  ,  0.5 ])

Are the raters always agree


In [23]:
print("percentage of players with different ratings: ", len(data[data['rater1'] != data['rater2']])*100 / len(data), "%")


percentage of players with different ratings:  23.18989619821962 %

Let's show some plost to visualize when the rater differe


In [24]:
len(data)


Out[24]:
124468

In [25]:
fig, ax = plt.subplots(1, 4, figsize=(16, 4))
ax[0].hist([data['rater1'], data['rater2']], bins=5)
ax[0].set_title("1) Raters compared \n(blue: rater1, green: rater2)")

ax[1].hist(abs(data['rater1'] - data['rater2']), bins=3)
ax[1].set_title("2) Difference (Rater1 - Rater2)")

dissagree_data = data[data['rater1'] != data['rater2']]
ax[2].hist(dissagree_data['rater1'], bins=5)
ax[2].set_title("3) Dissagreed values Rater1")

ax[3].hist(dissagree_data['rater2'], bins=5, color='seagreen')
ax[3].set_title("4) Dissagreed values Rater2")


Out[25]:
<matplotlib.text.Text at 0x7fe47e67cb00>

We can see that rater1 and 2 disagree the most when they have to rate "white" people.

We can also see with graph2 that when there disagree it's ony of one category.

Now we will create a new columns: color_skin that will be our label to guess. To convert the values of rater 1 and 2 in one rate, we need to follow some rules that come from the graph:

1. if rater1 and rater2 are agree, take that value
2. We can see on graph 4 that when rater2 give 0, usually, rater1 agrees => so when rater2 give 0, we take that value as the color skin
3. In graph 3, when rater1 give 1, rater2 usually agrees => when rater1 give 1, take that value as the color_skin
4. In graph 3, we can see that when rater1 rate1 give 0.25, usually rater2 agrees => take rater1
5. choose at random between both values


Then we can drop the features rater1, rater2 since we don't need them anymore


In [26]:
data['color_rating'] = -1

In [27]:
def color_skin_rules(row):
    #Rule 1
    if row.rater1 == row.rater2:
        return row.rater1
    #Rule2    
    elif row.rater2 == 0:
        return 0
    #Rule 3
    elif row.rater1 == 1:
        return 1
    #Rule 4
    elif row.rater1 == 0.25:
        return 0.25
    else: 
        return np.random.choice([row.rater1, row.rater2])
data.color_rating = data.apply(color_skin_rules, axis=1)

And now make the color_rating categorical:


In [28]:
categorical_dict = {0: 1, 0.25: 2, 0.5: 3, 0.75: 4, 1: 5 }
data['color_rating'] = data.apply(lambda row: categorical_dict[row['color_rating']]
, axis=1).astype('category')

In [29]:
data = data.drop(['rater1', 'rater2'], axis=1)

In [30]:
data_cleaned=data.copy()
data_cleaned.to_csv('CrowdstormingDataJuly1st_preprocessed.csv')
data_cleaned.head()


Out[30]:
playerShort player club leagueCountry birthday height weight position games victories ties defeats goals yellowCards yellowReds redCards photoID refNum refCountry Alpha_3 meanIAT nIAT seIAT meanExp nExp seExp color_rating
0 lucas-wilchez Lucas Wilchez Real Zaragoza Spain 31.08.1983 177.0 72.0 Attacking Midfielder 1 0 0 1 0 0 0 0 95212.jpg 1 1 GRC 0.326391 712.0 0.000564 0.396000 750.0 0.002696 2
1 john-utaka John Utaka Montpellier HSC France 08.01.1982 179.0 82.0 Right Winger 1 0 0 1 0 1 0 0 1663.jpg 2 2 ZMB 0.203375 40.0 0.010875 -0.204082 49.0 0.061504 4
5 aaron-hughes Aaron Hughes Fulham FC England 08.11.1979 182.0 71.0 Center Back 1 0 0 1 0 0 0 0 3868.jpg 4 4 LUX 0.325185 127.0 0.003297 0.538462 130.0 0.013752 1
6 aleksandar-kolarov Aleksandar Kolarov Manchester City England 10.11.1985 187.0 80.0 Left Fullback 1 1 0 0 0 0 0 0 47704.jpg 4 4 LUX 0.325185 127.0 0.003297 0.538462 130.0 0.013752 1
7 alexander-tettey Alexander Tettey Norwich City England 04.04.1986 180.0 68.0 Defensive Midfielder 1 0 0 1 0 0 0 0 22356.jpg 4 4 LUX 0.325185 127.0 0.003297 0.538462 130.0 0.013752 5

Aggregate the data

One solution is to group the data by player name. Then we need to find a solution to correctly group the remaining features:

- club: we have to check if a player appear in 2 different club (in case of a transfer during the winter mercato ) or if the transfer are not taking into account. (-> one (several) hot encoding. or majority dyads per club)
- leagueCountry: same as club
- position: test if the player as different -> position with the majority of game? 
- photoID: drop that information, the id is unique -> not relevant for our classification probleme
- refNum: replace with the total of unique refs
- refCountry: same as refNum
- Alpha_3: remove: redundant information since it correspond to the refCountry
- meanIAT: make new features
    - take mean 
    - take weighted mean (weight with nIAT)
    - take weighted mean (weight with game numers)
- meanExp: same as IAT
- seAIT: 
- seExp:

First do some checks


In [31]:
clubUnique = True
leagueUnique = True
positionUnique = True
def checkFunction(player):

    #check if the club is unique for one player
    if len(player.club.unique()) > 1:
        clubUnique = False
        print(player.player, 'plays for more than one team: ', player.club.unique())
    #check if the leagueCountry is unique
    if len(player.leagueCountry.unique()) > 1:
        leagueUnique = False
        print(player.player, 'plays for more than one league: ', player.leagueCountry.unique())
    #check if the position is unique
    if len(player.position.unique()) > 1:
        positionUnique = False
        print(player.player, 'plays for more than one position: ', player.position.unique())
           
data_group = pd.groupby(data_cleaned, by=data_cleaned.playerShort).apply(checkFunction)
print("Is the club for a player unique? ", clubUnique)
print("Is the league for a player unique? ", leagueUnique)
print("Is the position for a player unique? ", positionUnique)


Is the club for a player unique?  True
Is the league for a player unique?  True
Is the position for a player unique?  True

Then aggregate


In [ ]:


In [32]:
def meanCards(row, test):
    total_cards =   row.yellowCards + row.yellowReds + row.redCards     
    if total_cards == 0:
            return 0
    else:
        if(test == 'IAT'):
            return (row.meanIAT * row.yellowCards) + (row.meanIAT * row.yellowReds)  \
             + (row.meanIAT * row.redCards) / total_cards
        else:  
            return (row.meanExp * row.yellowCards) + (row.meanExp * row.yellowReds)  \
             + (row.meanExp * row.redCards) / total_cards
            

def aggreagtion(df):
    first_entry = df.head(1)
    # new aggregation entry
    new_entry = first_entry.copy()
    
    #sum of the info about the games
    new_entry.games = df.games.sum()
    new_entry.victories = df.victories.sum()
    new_entry.ties = df.ties.sum()
    new_entry.defeats = df.defeats.sum()
    new_entry.goals = df.goals.sum()
    new_entry.yellowCards = df.yellowCards.sum()
    new_entry.yellowReds = df.yellowReds.sum()
    new_entry.redCards = df.redCards.sum()
    
    #drop photoID and alpha_3
    new_entry.drop('photoID', inplace = True, axis=1)
    new_entry.drop('Alpha_3', inplace = True, axis=1)
    
    #refNum: number of unique ref
    new_entry = new_entry.rename(columns = {'refNum': 'refCount'})
    new_entry.refCount = len(df.refNum.unique())
    
    #refCountry: replace by number of unique country
    new_entry = new_entry.rename(columns = {'refCountry': 'refCountryCount'})
    new_entry.refCountryCount = len(df.refCountry.unique())
    
    #==Mean of the test result ===
    #- take mean 
    #- take weighted mean (weight with nIAT)
    #- take weighted mean (weight with game numers)
    
    new_entry.meanIAT = df.meanIAT.mean()
    new_entry.meanExp = df.meanExp.mean()
    
    new_entry['meanIAT_nIAT'] = (df.meanIAT * df.nIAT).sum() / df.nIAT.sum() 
    new_entry['meanExp_nExp'] = (df.meanExp * df.nExp).sum() / df.nExp.sum()
    
    new_entry['meanIAT_GameNbr'] = (df.meanIAT * df.games).sum() / df.games.sum()
    new_entry['meanExp_GameNbr'] = (df.meanExp * df.games).sum() / df.games.sum()
    
    new_entry['meanIAT_cards'] = df.apply(lambda r : meanCards(r, test='IAT'), axis = 1)   
    new_entry['meanExp_cards'] = df.apply(lambda r: meanCards(r, test = 'Exp'), axis = 1)
    
    #????????????????????? DROP nIART nExp or NOT ?????????????????????????????
    new_entry.drop('nIAT', inplace = True, axis=1)
    new_entry.drop('nExp', inplace = True, axis=1)
    
    # standard error = standard deviation / sqrt(n)
    #mean of the standard deviation: mean of the variance and then sqrt
    varIAT = (df.seIAT * np.sqrt(df.nIAT)) ** 2
    new_entry.seIAT = np.sqrt(np.mean(varIAT))/ np.sqrt(df.nIAT)
    varExp = (df.seExp * np.sqrt(df.nExp)) ** 2
    new_entry.seExp = np.sqrt(np.mean(varExp))/ np.sqrt(df.nExp)
    return new_entry
 
data_agregated = pd.groupby(data_cleaned, by="playerShort").apply(aggreagtion)
data_agregated


Out[32]:
playerShort player club leagueCountry birthday height weight position games victories ties defeats goals yellowCards yellowReds redCards refCount refCountryCount meanIAT seIAT meanExp seExp color_rating meanIAT_nIAT meanExp_nExp meanIAT_GameNbr meanExp_GameNbr meanIAT_cards meanExp_cards
playerShort
aaron-hughes 5 aaron-hughes Aaron Hughes Fulham FC England 08.11.1979 182.0 71.0 Center Back 654 247 179 228 9 19 0 0 166 37 0.346459 0.001505 0.494575 0.009691 1 0.328409 0.367721 0.333195 0.400637 0.000000 0.000000
aaron-hunt 745 aaron-hunt Aaron Hunt Werder Bremen Germany 04.09.1986 183.0 73.0 Attacking Midfielder 336 141 73 122 62 42 0 1 99 25 0.348818 0.000834 0.449220 0.003823 2 0.329945 0.441615 0.341438 0.380811 0.000000 0.000000
aaron-lennon 746 aaron-lennon Aaron Lennon Tottenham Hotspur England 16.04.1987 165.0 63.0 Right Midfielder 412 200 97 115 31 11 0 0 101 28 0.345893 0.001113 0.491482 0.006350 2 0.328230 0.365628 0.332389 0.399459 0.000000 0.000000
aaron-ramsey 747 aaron-ramsey Aaron Ramsey Arsenal FC England 26.12.1990 178.0 76.0 Center Midfielder 260 150 42 68 39 31 0 1 104 37 0.346821 0.003786 0.514693 0.015240 1 0.327775 0.412859 0.336638 0.433294 0.000000 0.000000
abdelhamid-el-kaoutari 4418 abdelhamid-el-kaoutari Abdelhamid El-Kaoutari Montpellier HSC France 17.03.1990 180.0 73.0 Center Back 124 41 40 43 1 8 4 2 37 11 0.331600 0.000474 0.335587 0.001745 2 0.338847 0.379497 0.331882 0.328895 0.000000 0.000000
abdou-traore_2 2305 abdou-traore_2 Abdou Traoré Girondins Bordeaux France 17.01.1988 180.0 74.0 Right Midfielder 97 41 23 33 3 11 1 0 42 13 0.320079 0.000889 0.296562 0.002659 4 0.332117 0.366031 0.327985 0.317247 0.000000 0.000000
abdoulaye-diallo_2 47 abdoulaye-diallo_2 Abdoulaye Diallo Stade Rennes France 30.03.1992 189.0 80.0 Goalkeeper 24 8 8 8 0 0 0 0 20 5 0.341625 0.000186 0.400818 0.000774 4 0.337572 0.361394 0.343556 0.428271 0.000000 0.000000
abdoulaye-keita_2 4423 abdoulaye-keita_2 Abdoulaye Keita Girondins Bordeaux France 19.08.1990 188.0 83.0 Goalkeeper 3 0 1 2 0 0 0 0 2 2 0.355406 0.000249 0.417225 0.001029 4 0.341567 0.363284 0.348498 0.390184 0.000000 0.000000
abdoulwhaid-sissoko 4424 abdoulwhaid-sissoko Abdoulwhaid Sissoko Stade Brest France 20.03.1990 180.0 68.0 Defensive Midfielder 121 34 25 62 3 21 0 2 56 3 0.348178 0.000163 0.429630 0.000674 5 0.344426 0.404517 0.344130 0.402314 0.000000 0.000000
abdul-rahman-baba 16617 abdul-rahman-baba Abdul Rahman Baba SpVgg Greuther Fürth Germany 02.07.1994 179.0 70.0 Left Fullback 50 17 8 25 0 3 0 1 27 7 0.342072 0.000103 0.361068 0.000553 4 0.337478 0.339232 0.339733 0.341395 0.000000 0.000000
abdul-razak 3802 abdul-razak Abdul Razak Manchester City England 11.11.1992 180.0 78.0 Center Midfielder 36 16 8 12 0 2 0 0 33 10 0.354566 0.000989 0.536408 0.006177 5 0.329069 0.369694 0.352243 0.521411 0.000000 0.000000
abel-aguilar 2306 abel-aguilar Abel Aguilar Deportivo La Coruña Spain 06.01.1985 185.0 78.0 Defensive Midfielder 246 93 61 92 25 70 3 2 110 22 0.359183 0.000400 0.549105 0.002156 3 0.330335 0.459423 0.362848 0.557574 0.369894 0.588297
abelaziz-barrada 343 abelaziz-barrada Abelaziz Barrada Getafe CF Spain 19.06.1989 185.0 78.0 Joker 85 28 25 32 14 11 1 1 39 18 0.341541 0.004548 0.399237 0.023271 1 0.351721 0.456120 0.356885 0.501552 0.000000 0.000000
abou-diaby 458 abou-diaby Abou Diaby Arsenal FC England 11.05.1986 188.0 75.0 Center Midfielder 208 109 51 48 21 24 0 2 65 21 0.336906 0.003345 0.422867 0.016387 4 0.328151 0.363045 0.331484 0.390292 0.000000 0.000000
adam-bodzek 14239 adam-bodzek Adam Bodzek Fortuna Düsseldorf Germany 07.09.1985 184.0 79.0 Defensive Midfielder 211 88 63 60 9 66 1 0 56 1 0.336628 0.000055 0.335967 0.000225 2 0.336628 0.335967 0.336628 0.335967 0.000000 0.000000
adam-campbell 1884 adam-campbell Adam Campbell Newcastle United England 01.01.1995 168.0 76.0 Joker 20 6 5 9 2 1 0 0 17 7 0.333246 0.000037 0.389978 0.000207 1 0.327112 0.357443 0.332263 0.384948 0.000000 0.000000
adam-federici 1922 adam-federici Adam Federici Reading FC England 31.01.1985 188.0 90.0 Goalkeeper 206 83 54 69 1 4 0 0 81 10 0.330250 0.000042 0.380157 0.000284 1 0.327063 0.413317 0.328090 0.365769 0.000000 0.000000
adam-hlousek 1589 adam-hlousek Adam Hloušek 1. FC Nürnberg Germany 20.12.1988 188.0 76.0 Left Midfielder 131 46 34 51 13 21 0 0 60 17 0.364772 0.000204 0.498535 0.000977 1 0.338019 0.371041 0.369815 0.507140 0.000000 0.000000
adam-johnson 749 adam-johnson Adam Johnson Sunderland AFC England 14.07.1987 175.0 62.0 Left Midfielder 339 145 68 126 55 21 0 0 92 22 0.338806 0.000718 0.418797 0.003206 1 0.327601 0.360663 0.330414 0.374101 0.000000 0.000000
adam-morgan 10385 adam-morgan Adam Morgan Liverpool FC England 21.04.1994 179.0 76.0 Joker 23 8 7 8 1 0 0 0 23 11 0.346997 0.000130 0.448102 0.000357 2 0.327470 0.359985 0.346997 0.448102 0.000000 0.000000
adam-pinter 1590 adam-pinter Ádám Pintér Real Zaragoza Spain 12.06.1988 190.0 76.0 Center Back 82 28 18 36 1 14 1 1 56 23 0.365417 0.000373 0.630062 0.002062 1 0.340622 0.440773 0.366836 0.616819 0.000000 0.000000
adam-smith_3 532 adam-smith_3 Adam Smith Millwall FC England 29.04.1991 173.0 73.0 Right Fullback 131 55 31 45 4 29 2 1 70 11 0.331529 0.000411 0.388481 0.002301 1 0.326793 0.356671 0.329276 0.373564 0.000000 0.000000
adam-szalai 1591 adam-szalai Ádám Szalai 1. FSV Mainz 05 Germany 09.12.1987 193.0 87.0 Center Forward 182 83 40 59 51 18 1 0 70 25 0.347791 0.000223 0.453066 0.001301 2 0.336896 0.365094 0.341194 0.382994 0.000000 0.000000
adan 1592 adan Adán Real Madrid Spain 13.05.1987 190.0 88.0 Goalkeeper 44 22 6 16 0 1 0 1 31 13 0.369184 0.000253 0.583849 0.001631 1 0.348044 0.471570 0.369231 0.584720 0.000000 0.000000
adel-taarabt 1924 adel-taarabt Adel Taarabt Queens Park Rangers England 24.05.1989 180.0 78.0 Left Midfielder 209 67 56 86 41 23 1 0 84 11 0.326478 0.000105 0.335564 0.000441 2 0.327227 0.358152 0.327106 0.350180 0.000000 0.000000
adil-rami 750 adil-rami Adil Rami Valencia CF Spain 27.12.1985 190.0 88.0 Center Back 286 144 71 71 20 59 2 3 101 25 0.354539 0.001044 0.492047 0.004972 2 0.337809 0.401877 0.351279 0.460517 0.325185 0.538462
adrian-colunga 1594 adrian-colunga Adrián Colunga Getafe CF Spain 17.11.1984 171.0 69.0 Center Forward 178 53 43 82 39 22 1 0 47 7 0.365679 0.000167 0.576993 0.000738 2 0.366673 0.572510 0.368781 0.585312 0.000000 0.000000
adrian-mutu 494 adrian-mutu Adrian Mutu AC Ajaccio France 08.01.1979 180.0 74.0 Left Winger 453 199 120 134 178 90 2 3 160 32 0.364151 0.000535 0.533691 0.002496 2 0.334899 0.390277 0.372146 0.519101 0.000000 0.000000
adrian_2 649 adrian_2 Adrián Atlético Madrid Spain 08.01.1988 183.0 73.0 Center Forward 294 145 62 87 60 16 0 0 86 31 0.357048 0.000304 0.517618 0.001735 1 0.329110 0.445014 0.365247 0.563727 0.000000 0.000000
adrian_7 611 adrian_7 Adrián SD Compostela Spain 21.05.1983 188.0 82.0 Goalkeeper 29 10 11 8 0 3 0 0 28 1 0.369894 0.000229 0.588297 0.001002 2 0.369894 0.588297 0.369894 0.588297 0.000000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
xisco_2 2225 xisco_2 Xisco Newcastle United England 26.06.1986 188.0 85.0 Center Forward 166 68 28 70 46 26 0 0 72 11 0.361779 0.000048 0.545934 0.000217 2 0.336222 0.407244 0.365399 0.562427 0.000000 0.000000
yacine-brahimi 218 yacine-brahimi Yacine Brahimi Granada CF Spain 08.02.1990 175.0 65.0 Attacking Midfielder 149 59 30 60 19 20 2 0 70 18 0.343521 0.000718 0.425677 0.002091 3 0.345511 0.420015 0.348997 0.450965 0.000000 0.000000
yann-mvila 1869 yann-mvila Yann M'Vila Stade Rennes France 26.06.1990 169.0 56.0 Defensive Midfielder 203 91 52 60 4 29 1 1 86 24 0.358188 0.000193 0.570958 0.000962 3 0.333653 0.386629 0.349619 0.497316 0.000000 0.000000
yannick-djalo 1217 yannick-djalo Yannick Djaló Toulouse FC France 05.05.1986 171.0 62.0 Center Forward 167 73 46 48 29 10 0 0 78 25 0.363715 0.001151 0.582103 0.005573 4 0.328302 0.458565 0.377326 0.667046 0.000000 0.000000
yannik-schulze 8964 yannik-schulze Yannik Schulze Hannover 96 Germany 05.02.1995 186.0 77.0 Center Back 49 20 12 17 3 8 0 0 35 1 0.336628 0.000055 0.335967 0.000225 1 0.336628 0.335967 0.336628 0.335967 0.000000 0.000000
yassine-benzia 4004 yassine-benzia Yassine Benzia Olympique Lyon France 08.09.1994 179.0 71.0 Joker 60 32 15 13 19 7 0 0 43 23 0.337672 0.002019 0.446000 0.010735 2 0.327654 0.454715 0.339181 0.439335 0.295086 0.488235
yassine-el-ghanassi 1218 yassine-el-ghanassi Yassine El Ghanassi West Bromwich Albion England 12.07.1990 173.0 76.0 Left Winger 181 79 42 60 26 15 0 0 67 21 0.360055 0.000956 0.510997 0.004346 3 0.342939 0.428957 0.361042 0.540813 0.000000 0.000000
yassine-jebbour 4871 yassine-jebbour Yassine Jebbour Stade Rennes France 24.08.1991 181.0 70.0 Left Fullback 54 16 15 23 0 2 1 2 30 8 0.343441 0.000184 0.420413 0.000791 4 0.337430 0.371346 0.339549 0.382941 0.000000 0.000000
yaya-toure 236 yaya-toure Yaya Touré Manchester City England 13.05.1983 191.0 90.0 Defensive Midfielder 448 255 83 110 79 82 0 1 152 44 0.346369 0.002599 0.428465 0.014016 5 0.333713 0.393813 0.346490 0.439967 0.000000 0.000000
yoan-gouffran 1220 yoan-gouffran Yoan Gouffran Girondins Bordeaux France 25.05.1986 175.0 70.0 Right Winger 269 121 67 81 59 18 0 1 75 20 0.338854 0.001154 0.415115 0.006546 3 0.329061 0.363175 0.335075 0.367163 0.000000 0.000000
yoann-gourcuff 476 yoann-gourcuff Yoann Gourcuff Olympique Lyon France 11.07.1986 185.0 79.0 Attacking Midfielder 341 178 73 90 51 31 0 2 120 34 0.350000 0.003457 0.466304 0.017888 1 0.329477 0.444544 0.344501 0.410531 0.000000 0.000000
yohan-cabaye 1222 yohan-cabaye Yohan Cabaye Newcastle United England 14.01.1986 175.0 69.0 Defensive Midfielder 372 178 106 88 56 86 1 1 119 30 0.345317 0.001287 0.443883 0.006975 1 0.329930 0.366518 0.337574 0.386732 0.325185 0.538462
yohandry-orozco 488 yohandry-orozco Yohandry Orozco VfL Wolfsburg Germany 19.03.1991 164.0 55.0 Attacking Midfielder 67 25 22 20 14 4 0 0 46 20 0.375346 0.001731 0.656897 0.010125 3 0.327946 0.458521 0.379415 0.717210 0.000000 0.000000
yossi-benayoun 1872 yossi-benayoun Yossi Benayoun West Ham United England 05.05.1980 178.0 70.0 Attacking Midfielder 456 201 105 150 88 31 1 0 147 34 0.353193 0.000281 0.522966 0.001805 2 0.328425 0.412539 0.344829 0.461143 0.000000 0.000000
younes-belhanda 4007 younes-belhanda Younès Belhanda Montpellier HSC France 25.02.1990 174.0 75.0 Attacking Midfielder 173 77 42 54 37 32 2 3 64 24 0.348692 0.002790 0.541128 0.014250 2 0.338342 0.394386 0.342731 0.456223 0.295086 0.488235
younes-kaboul 1223 younes-kaboul Younes Kaboul Tottenham Hotspur England 04.01.1986 190.0 87.0 Center Back 252 110 61 81 18 25 4 2 87 23 0.343357 0.001142 0.444511 0.006415 3 0.328878 0.365134 0.333986 0.391915 0.000000 0.000000
youssef-el-arabi 222 youssef-el-arabi Youssef El-Arabi Granada CF Spain 03.02.1987 180.0 75.0 Center Forward 159 52 47 60 57 15 0 0 67 15 0.335832 0.000537 0.363995 0.002771 3 0.342493 0.388743 0.343162 0.412267 0.000000 0.000000
yunus-malli 9010 yunus-malli Yunus Malli 1. FSV Mainz 05 Germany 24.02.1992 179.0 70.0 Attacking Midfielder 131 59 32 40 24 4 0 0 81 6 0.337992 0.000062 0.356349 0.000276 1 0.336189 0.340789 0.337471 0.348570 0.000000 0.000000
zdenk-pospch 1224 zdenk-pospch Zdeněk Pospěch 1. FSV Mainz 05 Germany 14.12.1978 174.0 72.0 Right Fullback 297 144 67 86 19 26 1 0 104 34 0.353786 0.001586 0.521861 0.007173 1 0.334409 0.385939 0.350481 0.494984 0.000000 0.000000
zdravko-kuzmanovic 34 zdravko-kuzmanovic Zdravko Kuzmanović VfB Stuttgart Germany 22.09.1987 186.0 71.0 Defensive Midfielder 339 156 77 106 38 42 1 1 136 36 0.358571 0.001509 0.500788 0.008648 1 0.336422 0.382447 0.358419 0.462541 0.000000 0.000000
ze-castro 1874 ze-castro Zé Castro Deportivo La Coruña Spain 13.01.1983 183.0 76.0 Center Back 169 77 31 61 3 28 1 1 60 12 0.366725 0.000153 0.570099 0.000688 2 0.353190 0.494750 0.368825 0.581665 0.000000 0.000000
zhi-gin-lam 8716 zhi-gin-lam Zhi Gin Lam Hamburger SV Germany 04.06.1991 175.0 67.0 Right Fullback 127 53 22 52 7 8 0 0 82 1 0.336628 0.000055 0.335967 0.000225 2 0.336628 0.335967 0.336628 0.335967 0.336628 0.335967
zlatan-alomerovic 9184 zlatan-alomerovic Zlatan Alomerović Borussia Dortmund Germany 15.06.1991 187.0 88.0 Goalkeeper 111 50 27 34 0 10 0 1 68 1 0.336628 0.000055 0.335967 0.000225 1 0.336628 0.335967 0.336628 0.335967 0.000000 0.000000
zlatan-ibrahimovic 520 zlatan-ibrahimovic Zlatan Ibrahimović Paris Saint-Germain France 03.10.1981 192.0 84.0 Center Forward 607 377 135 95 330 94 4 6 181 36 0.359884 0.000194 0.508760 0.000943 2 0.339429 0.409919 0.364542 0.497372 0.000000 0.000000
zlatko-junuzovic 2282 zlatko-junuzovic Zlatko Junuzović Werder Bremen Germany 26.09.1987 172.0 69.0 Attacking Midfielder 361 131 90 140 48 53 1 0 113 37 0.351308 0.000570 0.483496 0.002939 1 0.336874 0.388562 0.341763 0.414282 0.000000 0.000000
zoltan-gera 1227 zoltan-gera Zoltán Gera West Bromwich Albion England 22.04.1979 181.0 76.0 Left Winger 392 150 96 146 71 44 1 1 135 36 0.349091 0.001126 0.494216 0.005610 2 0.328085 0.364881 0.336001 0.417374 0.325185 0.538462
zoltan-stieber 3115 zoltan-stieber Zoltán Stieber SpVgg Greuther Fürth Germany 16.10.1988 175.0 67.0 Left Midfielder 142 48 37 57 27 12 0 0 53 6 0.337051 0.000165 0.360397 0.000719 1 0.335765 0.340852 0.336786 0.345085 0.000000 0.000000
zoumana-camara 1228 zoumana-camara Zoumana Camara Paris Saint-Germain France 03.04.1979 182.0 76.0 Center Back 395 148 117 130 7 46 2 6 90 21 0.348260 0.001014 0.426700 0.004507 4 0.331266 0.370416 0.338068 0.363993 0.325185 0.538462
zubikarai 3116 zubikarai Zubikarai Real Sociedad Spain 26.02.1984 185.0 84.0 Goalkeeper 47 14 15 18 0 2 0 2 29 2 0.368884 0.000227 0.591902 0.000992 1 0.367255 0.597637 0.369270 0.590521 0.000000 0.000000
zurutuza 3117 zurutuza Zurutuza Real Sociedad Spain 19.07.1986 186.0 78.0 Defensive Midfielder 160 68 39 53 12 22 0 0 40 6 0.365978 0.000289 0.590718 0.001307 1 0.368534 0.578865 0.368915 0.588902 0.369894 0.588297

1585 rows × 29 columns


In [33]:
data_agregated.to_csv('CrowdstormingDataJuly1st_aggregated.csv')

Encode the data for the Random Forest

The Random Forest can only handle integer or float attributes, so we have to encode the string attributes as numbers.


In [34]:
def encode(data_frame, inplace=False):
    """
    encodes the non numerical columns with a LabelEncoder
    returns a new data_frame if inplace = False, otherwise changes the given data_frame
    """
    _df = data_frame if inplace else data_frame.copy()
    le = pp.LabelEncoder() # encoder
    for col_name in _df.columns:
        if _df.dtypes[col_name] == 'O':
            _df[col_name] = le.fit_transform(_df[col_name])
            print("encoded", col_name)
    return _df

In [35]:
data_cleaned_encoded = encode(data_cleaned, inplace=False)
data_cleaned_encoded.head(1)


encoded playerShort
encoded player
encoded club
encoded leagueCountry
encoded birthday
encoded position
encoded photoID
encoded Alpha_3
Out[35]:
playerShort player club leagueCountry birthday height weight position games victories ties defeats goals yellowCards yellowReds redCards photoID refNum refCountry Alpha_3 meanIAT nIAT seIAT meanExp nExp seExp color_rating
0 901 1046 70 3 1382 177.0 72.0 0 1 0 0 1 0 0 0 0 1532 1 1 59 0.326391 712.0 0.000564 0.396 750.0 0.002696 2

In [36]:
data_cleaned_aggregated_encoded = encode(data_agregated, inplace=False)
data_cleaned_aggregated_encoded.head(1)


encoded playerShort
encoded player
encoded club
encoded leagueCountry
encoded birthday
encoded position
Out[36]:
playerShort player club leagueCountry birthday height weight position games victories ties defeats goals yellowCards yellowReds redCards refCount refCountryCount meanIAT seIAT meanExp seExp color_rating meanIAT_nIAT meanExp_nExp meanIAT_GameNbr meanExp_GameNbr meanIAT_cards meanExp_cards
playerShort
aaron-hughes 5 0 392 34 0 360 182.0 71.0 1 654 247 179 228 9 19 0 0 166 37 0.346459 0.001505 0.494575 0.009691 1 0.328409 0.367721 0.333195 0.400637 0.0 0.0

In [37]:
data_cleaned_encoded.to_csv('CrowdstormingDataJuly1st_preprocessed_encoded.csv')
data_cleaned_aggregated_encoded.set_index("playerShort").to_csv('CrowdstormingDataJuly1st_aggregated_encoded.csv')

In [38]:
data_cleaned_aggregated_encoded.set_index("playerShort").head()


Out[38]:
player club leagueCountry birthday height weight position games victories ties defeats goals yellowCards yellowReds redCards refCount refCountryCount meanIAT seIAT meanExp seExp color_rating meanIAT_nIAT meanExp_nExp meanIAT_GameNbr meanExp_GameNbr meanIAT_cards meanExp_cards
playerShort
0 392 34 0 360 182.0 71.0 1 654 247 179 228 9 19 0 0 166 37 0.346459 0.001505 0.494575 0.009691 1 0.328409 0.367721 0.333195 0.400637 0.0 0.0
1 393 91 2 176 183.0 73.0 0 336 141 73 122 62 42 0 1 99 25 0.348818 0.000834 0.449220 0.003823 2 0.329945 0.441615 0.341438 0.380811 0.0 0.0
2 394 83 0 719 165.0 63.0 11 412 200 97 115 31 11 0 0 101 28 0.345893 0.001113 0.491482 0.006350 2 0.328230 0.365628 0.332389 0.399459 0.0 0.0
3 395 6 0 1199 178.0 76.0 3 260 150 42 68 39 31 0 1 104 37 0.346821 0.003786 0.514693 0.015240 1 0.327775 0.412859 0.336638 0.433294 0.0 0.0
4 396 51 1 758 180.0 73.0 1 124 41 40 43 1 8 4 2 37 11 0.331600 0.000474 0.335587 0.001745 2 0.338847 0.379497 0.331882 0.328895 0.0 0.0

In [ ]: