In [1]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import show
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing as pp
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer, roc_curve, roc_auc_score
import sklearn.preprocessing as preprocessing
%matplotlib inline
sns.set_context('notebook')
pd.options.mode.chained_assignment = None # default='warn'
pd.set_option('display.max_columns', 500) # to see all columns
In [2]:
data = pd.read_csv('CrowdstormingDataJuly1st.csv')
data_total = data.copy()
print('Shape of the data (dyads, columns):', data.shape)
data.head()
Out[2]:
In [3]:
print('Number of dyads: ', len(data))
print('Number of players: ', len(data.playerShort.unique()))
print('Number of referees: ', len(data.refNum.unique()))
In [4]:
complete = len(data.dropna())
all_ = len(data_total)
print('Number of rows with complete data: {} ({:.3f}%)'.format(complete, complete / all_ * 100))
print('Number of rows with missing data: {} ({:.3f}%)'.format(all_ - complete, (all_ - complete) / all_ * 100))
In [5]:
def find_col_nan(d):
    # return the columns that contain at least one NaN
    return [c for c in d.columns if d[c].isnull().any()]
In [6]:
missing_col = find_col_nan(data)
missing_col
Out[6]:
To clean the data, we will go step by step:
- First of all, we have to remove all dyads that don't have any rating, because those dyads are of no use for our problem.
- Then we will look again at which columns contain missing values and decide how to deal with them.
In [7]:
data = data[~data.rater1.isnull() & ~data.rater2.isnull()]
print('Number of rows with the 2 ratings: {} ({:.3f}%)'.format(len(data), len(data) / len(data_total) * 100))
# rows where exactly one of the two ratings is present (computed on the original data, before filtering)
onlyOne = data_total[data_total.rater1.isnull() ^ data_total.rater2.isnull()]
print('Number of rows with only one rating: {} ({:.3f}%)'.format(len(onlyOne), len(onlyOne) / len(data_total) * 100))
Now check how many incomplete dyads the data still contains.
In [8]:
complete = len(data.dropna())
all_ = len(data)
print("After removing data without rating:")
print("-----------------------------------")
print('Number of rows with complete data: {} ({:.3f}%)'.format(complete, complete / all_ * 100))
print('Number of rows with missing data: {} ({:.3f}%)'.format(all_ - complete, (all_ - complete) / all_ * 100))
Let's check the columns with missing values again.
In [9]:
missing_col = find_col_nan(data)
missing_col
Out[9]:
Only 7.4% of the dyads are incomplete. Let's replace the NaN in height and weight with the median.
In [10]:
# replace missing height and weight with the median value
median_height = data['height'].median()
median_weight = data['weight'].median()
data['height'] = data['height'].fillna(value=median_height)
data['weight'] = data['weight'].fillna(value=median_weight)
In [11]:
complete = len(data.dropna())
all_ = len(data)
print("After removing data without rating:")
print("-----------------------------------")
print('Number of row with complete data: {} ({:.3f}%)'.format(complete, (complete/all_ ) * 100 ))
print('Number of row with missing data: {} ({:.3f}%)'.format(all_-complete, (all_ -complete)/all_ * 100 ))
In [12]:
missing_col = find_col_nan(data)
missing_col
Out[12]:
The incomplete rows now represent only about 7%. We will check whether there exist two dyads with referees from the same country, one with complete data and another with missing IAT and Exp information. In that case, it would be easy to fill in the missing values.
In [13]:
missing_col_test = ['meanIAT', 'nIAT', 'seIAT', 'meanExp', 'nExp', 'seExp']
exist = False
def checkMissingTest(df):
    # df holds all dyads of one referee country; flag the country if a test column
    # is filled in some dyads but missing in others
    global exist
    for col in missing_col_test:
        nbr_dyads = len(df)
        nbr_noNaN = len(df.dropna(subset=[col]))
        if nbr_dyads > nbr_noNaN and nbr_noNaN > 0:
            exist = True
            print('There exist valid data for ', df.Alpha_3.iloc[0])
data.groupby('refCountry').apply(checkMissingTest)
print('Are there 2 dyads from the same country, one with test information and one with missing test values?: ', exist)
Look at how many players don't have a position.
In [14]:
complete = len(data.dropna(subset=['position']))
all_ = len(data)
print("After removing data without rating:")
print("-----------------------------------")
print('Number of row with complete data: {} ({:.3f}%)'.format(complete, (complete/all_ ) * 100 ))
print('Number of row with missing data: {} ({:.3f}%)'.format(all_-complete, (all_ -complete)/all_ * 100 ))
For those players, we will assign a new position called 'Joker'.
In [15]:
data.position = data.position.fillna('Joker')
In [16]:
missing_col = find_col_nan(data)
missing_col
Out[16]:
In [17]:
complete = len(data.dropna())
all_ = len(data)
print("After removing data without rating:")
print("-----------------------------------")
print('Number of row with complete data: {} ({:.3f}%)'.format(complete, (complete/all_ ) * 100 ))
print('Number of row with missing data: {} ({:.3f}%)'.format(all_-complete, (all_ -complete)/all_ * 100 ))
The remaining NaN represent only 0.1% of the rows and concern the referee-country columns and the test results in those countries, which are hard to guess. So we decide to drop the remaining dyads with NaN.
In [18]:
data = data.dropna()
find_col_nan(data)
Out[18]:
Are there players with only one rating?
In [19]:
(data.rater1.isnull() | data.rater2.isnull()).any()
Out[19]:
Are the raters consistent?
In [20]:
def areRaterConsistent(d):
    # a rater is consistent if he gives the same rating to a given player in all of that player's dyads
    for playerID in d.playerShort.unique():
        player = d[d.playerShort == playerID]
        rater1 = player.rater1.unique()
        rater2 = player.rater2.unique()
        if len(rater1) > 1 or len(rater2) > 1:
            return False
    return True
In [21]:
print("Are the rater consistent: ",areRaterConsistent(data))
Possible values of the ratings:
In [22]:
data.rater1.unique()
Out[22]:
Do the raters always agree?
In [23]:
print("percentage of players with different ratings: ", len(data[data['rater1'] != data['rater2']])*100 / len(data), "%")
Let's show some plots to visualize when the raters differ.
In [24]:
len(data)
Out[24]:
In [25]:
fig, ax = plt.subplots(1, 4, figsize=(16, 4))
ax[0].hist([data['rater1'], data['rater2']], bins=5, color=['steelblue', 'seagreen'])
ax[0].set_title("1) Raters compared \n(blue: rater1, green: rater2)")
ax[1].hist(abs(data['rater1'] - data['rater2']), bins=3)
ax[1].set_title("2) Absolute difference |rater1 - rater2|")
disagree_data = data[data['rater1'] != data['rater2']]
ax[2].hist(disagree_data['rater1'], bins=5)
ax[2].set_title("3) Disagreed values, rater1")
ax[3].hist(disagree_data['rater2'], bins=5, color='seagreen')
ax[3].set_title("4) Disagreed values, rater2")
Out[25]:
We can see that rater1 and rater2 disagree the most when they have to rate "white" players.
We can also see in graph 2 that when they disagree, it is only by one category.
Now we will create a new column, color_rating, that will be the label we want to predict. To merge the values of rater1 and rater2 into a single rating, we follow some rules derived from the graphs:
1. If rater1 and rater2 agree, take that value.
2. We can see in graph 4 that when rater2 gives 0, rater1 usually agrees => so when rater2 gives 0, take that value as the skin color.
3. In graph 3, when rater1 gives 1, rater2 usually agrees => when rater1 gives 1, take that value as the color_rating.
4. In graph 3, we can also see that when rater1 gives 0.25, rater2 usually agrees => take rater1's value.
5. Otherwise, choose at random between the two values.
Then we can drop the features rater1 and rater2, since we don't need them anymore.
In [26]:
data['color_rating'] = -1
In [27]:
def color_skin_rules(row):
    # Rule 1
    if row.rater1 == row.rater2:
        return row.rater1
    # Rule 2
    elif row.rater2 == 0:
        return 0
    # Rule 3
    elif row.rater1 == 1:
        return 1
    # Rule 4
    elif row.rater1 == 0.25:
        return 0.25
    # Rule 5
    else:
        return np.random.choice([row.rater1, row.rater2])
data.color_rating = data.apply(color_skin_rules, axis=1)
And now make the color_rating categorical:
In [28]:
categorical_dict = {0: 1, 0.25: 2, 0.5: 3, 0.75: 4, 1: 5}
data['color_rating'] = data['color_rating'].map(categorical_dict).astype('category')
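Since the 1-to-5 scale is ordinal, an alternative (just a sketch, not what is done above) would be to declare the rating as an ordered categorical so the ordering is kept explicitly; `color_rating_ordered` is an illustrative name:
In [ ]:
# alternative (not applied to `data` below): keep the ordinal structure of the rating explicit
color_rating_ordered = pd.Categorical(data['color_rating'], categories=[1, 2, 3, 4, 5], ordered=True)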
In [29]:
data = data.drop(['rater1', 'rater2'], axis=1)
In [30]:
data_cleaned=data.copy()
data_cleaned.to_csv('CrowdstormingDataJuly1st_preprocessed.csv')
data_cleaned.head()
Out[30]:
One solution is to group the data by player name. Then we need to find a way to correctly aggregate the remaining features:
- club: we have to check whether a player appears in 2 different clubs (in case of a transfer during the winter mercato) or whether transfers are not taken into account. (-> one-hot encoding, or the club with the majority of dyads)
- leagueCountry: same as club
- position: check whether a player has several positions -> take the position with the majority of games?
- photoID: drop that information, the id is unique -> not relevant for our classification problem
- refNum: replace with the total number of unique referees
- refCountry: same as refNum
- Alpha_3: remove: redundant information, since it corresponds to refCountry
- meanIAT: make new features (see the small sketch right after this list):
  - take the mean
  - take the weighted mean (weighted by nIAT)
  - take the weighted mean (weighted by the number of games)
- meanExp: same as meanIAT
- seIAT: combine the standard errors (convert back to variances, average them, then take the square root)
- seExp: same as seIAT
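As a quick illustration of the weighted-mean idea, here is a minimal sketch on made-up numbers (the `toy` frame and its values are purely illustrative, not taken from the dataset):
In [ ]:
# toy example: three dyads of one player, with hypothetical meanIAT scores and nIAT sample sizes
toy = pd.DataFrame({'meanIAT': [0.30, 0.35, 0.40], 'nIAT': [100, 300, 600]})
plain_mean = toy.meanIAT.mean()                                  # (0.30 + 0.35 + 0.40) / 3 = 0.35
weighted_mean = (toy.meanIAT * toy.nIAT).sum() / toy.nIAT.sum()  # (30 + 105 + 240) / 1000 = 0.375
print(plain_mean, weighted_mean)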
First, let's do some checks.
In [31]:
clubUnique = True
leagueUnique = True
positionUnique = True
def checkFunction(player):
    global clubUnique, leagueUnique, positionUnique
    # check if the club is unique for one player
    if len(player.club.unique()) > 1:
        clubUnique = False
        print(player.player.iloc[0], 'plays for more than one team: ', player.club.unique())
    # check if the leagueCountry is unique
    if len(player.leagueCountry.unique()) > 1:
        leagueUnique = False
        print(player.player.iloc[0], 'plays for more than one league: ', player.leagueCountry.unique())
    # check if the position is unique
    if len(player.position.unique()) > 1:
        positionUnique = False
        print(player.player.iloc[0], 'plays for more than one position: ', player.position.unique())
data_cleaned.groupby('playerShort').apply(checkFunction)
print("Is the club for a player unique? ", clubUnique)
print("Is the league for a player unique? ", leagueUnique)
print("Is the position for a player unique? ", positionUnique)
Then aggregate
In [32]:
def meanCards(df, test):
    # weighted mean of the test score over a player's dyads, weighted by the number of cards received in each dyad
    cards = df.yellowCards + df.yellowReds + df.redCards
    total_cards = cards.sum()
    if total_cards == 0:
        return 0
    mean_col = df.meanIAT if test == 'IAT' else df.meanExp
    return (mean_col * cards).sum() / total_cards
def aggregation(df):
    first_entry = df.head(1)
    # new aggregation entry
    new_entry = first_entry.copy()
    # sum of the info about the games
    new_entry.games = df.games.sum()
    new_entry.victories = df.victories.sum()
    new_entry.ties = df.ties.sum()
    new_entry.defeats = df.defeats.sum()
    new_entry.goals = df.goals.sum()
    new_entry.yellowCards = df.yellowCards.sum()
    new_entry.yellowReds = df.yellowReds.sum()
    new_entry.redCards = df.redCards.sum()
    # drop photoID and Alpha_3
    new_entry.drop('photoID', inplace=True, axis=1)
    new_entry.drop('Alpha_3', inplace=True, axis=1)
    # refNum: number of unique referees
    new_entry = new_entry.rename(columns={'refNum': 'refCount'})
    new_entry.refCount = len(df.refNum.unique())
    # refCountry: replace by the number of unique countries
    new_entry = new_entry.rename(columns={'refCountry': 'refCountryCount'})
    new_entry.refCountryCount = len(df.refCountry.unique())
    # === Mean of the test results ===
    # - take the mean
    # - take the weighted mean (weighted by nIAT / nExp)
    # - take the weighted mean (weighted by the number of games)
    # - take the weighted mean (weighted by the number of cards)
    new_entry.meanIAT = df.meanIAT.mean()
    new_entry.meanExp = df.meanExp.mean()
    new_entry['meanIAT_nIAT'] = (df.meanIAT * df.nIAT).sum() / df.nIAT.sum()
    new_entry['meanExp_nExp'] = (df.meanExp * df.nExp).sum() / df.nExp.sum()
    new_entry['meanIAT_GameNbr'] = (df.meanIAT * df.games).sum() / df.games.sum()
    new_entry['meanExp_GameNbr'] = (df.meanExp * df.games).sum() / df.games.sum()
    new_entry['meanIAT_cards'] = meanCards(df, test='IAT')
    new_entry['meanExp_cards'] = meanCards(df, test='Exp')
    # open question: keep nIAT / nExp or not? We drop them here since they are folded into the weighted means.
    new_entry.drop('nIAT', inplace=True, axis=1)
    new_entry.drop('nExp', inplace=True, axis=1)
    # standard error = standard deviation / sqrt(n), so standard deviation = standard error * sqrt(n)
    # combine the per-country standard errors: average the variances, take the square root,
    # and divide by the square root of the total number of observations
    varIAT = (df.seIAT * np.sqrt(df.nIAT)) ** 2
    new_entry.seIAT = np.sqrt(np.mean(varIAT)) / np.sqrt(df.nIAT.sum())
    varExp = (df.seExp * np.sqrt(df.nExp)) ** 2
    new_entry.seExp = np.sqrt(np.mean(varExp)) / np.sqrt(df.nExp.sum())
    return new_entry
data_agregated = data_cleaned.groupby("playerShort").apply(aggregation)
data_agregated
Out[32]:
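A quick consistency check one might run on the result (a minimal sketch; it assumes data_agregated and data_cleaned are exactly the frames built above):
In [ ]:
# the aggregation should preserve the total number of games and produce exactly one row per player
assert data_agregated.games.sum() == data_cleaned.games.sum()
assert len(data_agregated) == data_cleaned.playerShort.nunique()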
In [33]:
data_agregated.to_csv('CrowdstormingDataJuly1st_aggregated.csv')
In [34]:
def encode(data_frame, inplace=False):
    """
    Encodes the non-numerical columns with a LabelEncoder.
    Returns a new data_frame if inplace=False, otherwise changes the given data_frame.
    """
    _df = data_frame if inplace else data_frame.copy()
    le = pp.LabelEncoder()  # encoder
    for col_name in _df.columns:
        if _df.dtypes[col_name] == 'O':  # object dtype, i.e. string columns
            _df[col_name] = le.fit_transform(_df[col_name])
            print("encoded", col_name)
    return _df
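Note that a LabelEncoder maps club, leagueCountry and position to arbitrary integers, which implicitly imposes an ordering on them. As mentioned in the aggregation plan above, one-hot encoding is an alternative; here is a minimal sketch (`data_onehot` is an illustrative name and is not used in the rest of the notebook):
In [ ]:
# alternative to label encoding: one-hot encode the nominal columns (not used below)
data_onehot = pd.get_dummies(data_cleaned, columns=['club', 'leagueCountry', 'position'])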
In [35]:
data_cleaned_encoded = encode(data_cleaned, inplace=False)
data_cleaned_encoded.head(1)
Out[35]:
In [36]:
data_cleaned_aggregated_encoded = encode(data_agregated, inplace=False)
data_cleaned_aggregated_encoded.head(1)
Out[36]:
In [37]:
data_cleaned_encoded.to_csv('CrowdstormingDataJuly1st_preprocessed_encoded.csv')
data_cleaned_aggregated_encoded.set_index("playerShort").to_csv('CrowdstormingDataJuly1st_aggregated_encoded.csv')
In [38]:
data_cleaned_aggregated_encoded.set_index("playerShort").head()
Out[38]:
In [ ]: