In [1]:
%run "../Functions/4. User comparison.ipynb"


rmdf152 read_csv success
../Utilities/Preparation.ipynb:2: ParserWarning: Both a converter and dtype were specified for column customData.localplayerguid - only the converter will be used
  "cells": [
gformEN read_csv success
gformFR read_csv success
temporalities set

Prepare data

Prepare Google form data


In [2]:
# Rename the Google Forms columns with short tags that are independent of the form language.
# The tags are applied positionally: the n-th form column receives the n-th tag.
columnTags = (
    [
        "timestamp",
        "gameInterest", "gameFrequency",
        "age", "gender",
        "biologyStudy", "biologyInterest",
        "synthBioKnowledge", "biobrickKnowledge",
        "previousVersion", "previousPlay", "arcadePlay", "androidPlay",
    ]
    # Scientific questions Q1..Q27
    + ["Q" + str(questionNumber) for questionNumber in range(1, 28)]
    + ["comments", "anonymousID", "lang", "temporality"]
)
columnQuestions = gform.columns.tolist()
googleData = gform.rename(columns=dict(zip(columnQuestions, columnTags)))
#googleData.head()

In [3]:
# Replace the answers to the scientific questions (Q1..Q27) by their correction,
# stored as an integer (the original comment describes the values as True or False).
correctedData = googleData.copy()
questionToTag = dict(zip(columnQuestions, columnTags))
scientificTags = ["Q" + str(questionNumber) for questionNumber in range(1, 28)]
for rowId in range(correctedData.shape[0]):
    # Fetch the corrections of the subject who filled in this row
    correction = getCorrections(correctedData.loc[rowId, "anonymousID"])
    if correction.shape[1] <= 0:
        # No correction column: the row is left untouched
        # (presumably the subject has not answered the questionnaire - TODO confirm)
        continue
    # Index the corrections by tag instead of by full question text
    correction = correction.rename(index=questionToTag)
    # NOTE(review): the correction column is looked up with the row number as suffix;
    # confirm that this matches the column naming used by getCorrections.
    correctionColumn = "corrections" + str(rowId)
    for questionTag in scientificTags:
        correctedData.loc[rowId, questionTag] = int(correction.loc[questionTag, correctionColumn])

#correctedData.tail(15)

In [4]:
# Show which tag was given to each original question
[(question, tag) for question, tag in zip(columnQuestions, columnTags)]


Out[4]:
[('Timestamp', 'timestamp'),
 ('Are you interested in video games?', 'gameInterest'),
 ('Do you play video games?', 'gameFrequency'),
 ('How old are you?', 'age'),
 ('What is your gender?', 'gender'),
 ('How long have you studied biology?', 'biologyStudy'),
 ('Are you interested in biology?', 'biologyInterest'),
 ('Before playing Hero.Coli, had you ever heard about synthetic biology?',
  'synthBioKnowledge'),
 ('Before playing Hero.Coli, had you ever heard about BioBricks?',
  'biobrickKnowledge'),
 ('Have you ever played an older version of Hero.Coli before?',
  'previousVersion'),
 ('Have you played the current version of Hero.Coli?', 'previousPlay'),
 ('Have you played the arcade cabinet version of Hero.Coli?', 'arcadePlay'),
 ('Have you played the Android version of Hero.Coli?', 'androidPlay'),
 ('In order to modify the abilities of the bacterium, you have to...', 'Q1'),
 ('What are BioBricks and devices?', 'Q2'),
 ('What is the name of this BioBrick? TER', 'Q3'),
 ('What is the name of this BioBrick? PR', 'Q4'),
 ('What is the name of this BioBrick? CDS', 'Q5'),
 ('What is the name of this BioBrick? RBS', 'Q6'),
 ('What does this BioBrick do? TER', 'Q7'),
 ('What does this BioBrick do? PR', 'Q8'),
 ('What does this BioBrick do? CDS', 'Q9'),
 ('What does this BioBrick do? RBS', 'Q10'),
 ('Pick the case where the BioBricks are well-ordered:', 'Q11'),
 ('When does green fluorescence happen?', 'Q12'),
 ('What happens when you unequip the movement device?', 'Q13'),
 ('What is this? PLASMID', 'Q14'),
 ('What does this device do? PCONS:RBS:GFP:TER', 'Q15'),
 ('What does this device do? PCONS:RBS:FLHDC:TER', 'Q16'),
 ('What does this device do? PCONS:RBS:AMPR:TER', 'Q17'),
 ('What does this device do? PBAD:RBS:GFP:TER', 'Q18'),
 ('What does this device do? PCONS:RBS:GFP:TER 2', 'Q19'),
 ('What does this device do? PCONS:RBS:FLHDC:TER 2', 'Q20'),
 ('What does this device do? PCONS:RBS:AMPR:TER 2', 'Q21'),
 ('What does this device do? PBAD:RBS:GFP:TER 2', 'Q22'),
 ('Guess: what would a device producing l-arabinose do, if it started with a l-arabinose-induced promoter?',
  'Q23'),
 ('Guess: the bacterium would glow yellow...', 'Q24'),
 ('What is the species of the bacterium of the game?', 'Q25'),
 ('What is the scientific name of the tails of the bacterium?', 'Q26'),
 ('Find the antibiotic:', 'Q27'),
 ('You can write down remarks here.', 'comments'),
 ('Do not edit -  pre-filled anonymous ID', 'anonymousID'),
 ('Language', 'lang'),
 ('Temporality', 'temporality')]

For association rule mining


In [5]:
# Keep only the scientific questions: label-based slice, both ends included
scientificColumns = slice("Q1", "Q27")
correctedScientific = correctedData.loc[:, scientificColumns]
#correctedScientific.head()

For clustering

With full answers


In [6]:
# Work on a copy of the tagged answers, without the timestamp and free-text comments
uncodedFeatures = ['timestamp', "comments"]
codedData = googleData.copy()
codedData = codedData.drop(uncodedFeatures, axis=1)
codedData.head()


Out[6]:
gameInterest gameFrequency age gender biologyStudy biologyInterest synthBioKnowledge biobrickKnowledge previousVersion previousPlay ... Q21 Q22 Q23 Q24 Q25 Q26 Q27 anonymousID lang temporality
0 Extremely Extremely 23 Female Until bachelor's degree Extremely Yes NaN No Yes ... It generates antibiotic resistance It generates green fluorescence in presence of... After being induced, it would produce more and... I don't know E. Coli Flagella Ampicillin 8d352896-a3f1-471c-8439-0f426df901c1 en before
1 Moderately Moderately 28 Other Until the end of high school Moderately Yes NaN No Yes ... It generates antibiotic resistance It generates green fluorescence in presence of... After being induced, it would produce more and... I don't know E. Coli Flagella Ampicillin 7037c5b2-c286-498e-9784-9a061c778609 en after
2 A lot Moderately 20 Female Until bachelor's degree Moderately No NaN No Yes ... It generates antibiotic resistance It generates green fluorescence I don't know If it produces BFP under purple light E. Coli Flagella Ampicillin 5c4939b5-425b-4d19-b5d2-0384a515539e en after
3 Moderately Moderately 21 Male Until bachelor's degree A lot Yes NaN No Yes ... It generates antibiotic resistance It generates green fluorescence in presence of... After being induced, it would produce more and... If it produces YFP under cyan light E. Coli Flagella Ampicillin acb9c989-b4a6-4c4d-81cc-6b5783ec71d8 en before
4 Moderately Rarely 18 Female Until bachelor's degree A lot No No No No ... I don't know I don't know I don't know If it produced YFP under cyan light I don't know I don't know Ampicillin 1a03bc9e-bed4-4ddd-be7f-af23b1d5eb65 en before

5 rows × 42 columns


In [7]:
# Code answers with integers when possible

# Equivalence tables: each maps an answer, in English or in French, to its integer code.

# gameInterest
gameInterestCoding = {
    "A lot": 4,
    "Beaucoup": 4,
    "Enormément": 5,
    "Extremely": 5,
    "Moderately": 3,
    "Moyennement": 3,
    "Slightly": 2,
    "Un peu": 2,
    "I don't know": 3,
    "Je ne sais pas": 3,
    "Not at all": 1,
    "Pas du tout": 1,
}

# gameFrequency (the English form uses "Rarely" where the interest scales use "Slightly")
gameFrequencyCoding = {
    "A lot": 4,
    "Beaucoup": 4,
    "Enormément": 5,
    "Extremely": 5,
    "Moderately": 3,
    "Moyennement": 3,
    "Rarely": 2,
    "Un peu": 2,
    "I don't know": 3,
    "Je ne sais pas": 3,
    "Not at all": 1,
    "Pas du tout": 1,
}

# biologyStudy
biologyStudyCoding = {
    "Not even in middle school": 0,
    "Jamais": 0,
    "Jamais, pas même au collège": 0,
    "Until the end of middle school": 1,
    "Jusqu'au brevet": 1,
    "Until the end of high school": 2,
    "Jusqu'au bac": 2,
    "Until bachelor's degree": 3,
    "Jusqu'à la license": 3,
    "At least until master's degree": 4,
    "Au moins jusqu'au master": 4,
    "I don't know": 0,
    "Je ne sais pas": 0,
}

# biologyInterest
biologyInterestCoding = {
    "A lot": 4,
    "Beaucoup": 4,
    "Enormément": 5,
    "Extremely": 5,
    "Moderately": 3,
    "Moyennement": 3,
    "Slightly": 2,
    "Un peu": 2,
    "I don't know": 3,
    "Je ne sais pas": 3,
    "Not at all": 1,
    "Pas du tout": 1,
}

# synthBioKnowledge, biobrickKnowledge
previousKnowledgeCoding = {
    "Yes": 1,
    "No": 0,
    "I don't know": 0,
    "Oui": 1,
    'Non': 0,
    "Je ne sais pas": 0,
}

# previousVersion, previousPlay, arcadePlay, androidPlay
previousPlayCoding = {
    "Multiple times": 3,
    "A few times": 2,
    "Once": 1,
    "Yes": 1,
    "No": 0,
    "I don't know": 0,
    "De nombreuses fois": 3,
    "Quelques fois": 2,
    "Une fois": 1,
    "Oui": 1,
    "Non": 0,
    "Je ne sais pas": 0,
}

# lang
languageCoding = {"en": 0, "fr": 1}

# temporality
temporalityCoding = {"before": 0, "after": 1, "undefined": -5}

# Fill NaN cells
# The filled columns are assigned back to the frame: calling fillna(inplace=True) on
# codedData["..."] is a chained assignment, which no longer updates codedData once
# pandas Copy-on-Write is active.
unansweredColumns = ["biobrickKnowledge", "arcadePlay", "androidPlay", "previousPlay"]
codedData[unansweredColumns] = codedData[unansweredColumns].fillna("I don't know")

# Replace by code
# Which equivalence table applies to which column
codingByColumn = {
    "gameInterest": gameInterestCoding,
    "gameFrequency": gameFrequencyCoding,
    "biologyStudy": biologyStudyCoding,
    "biologyInterest": biologyInterestCoding,
    "synthBioKnowledge": previousKnowledgeCoding,
    "biobrickKnowledge": previousKnowledgeCoding,
    "previousVersion": previousPlayCoding,
    "previousPlay": previousPlayCoding,
    "arcadePlay": previousPlayCoding,
    "androidPlay": previousPlayCoding,
    "lang": languageCoding,
    "temporality": temporalityCoding,
}
# Whole columns are coded at once, so the result does not depend on the row labels
# being exactly 0..n-1. The explicit lookup (rather than passing the dict to map)
# keeps raising KeyError on an answer missing from the table, NaN included, instead
# of silently producing NaN codes. Coded columns end up with an integer dtype.
for columnName, coding in codingByColumn.items():
    codedData[columnName] = codedData[columnName].map(lambda answer, coding=coding: coding[answer])

In [8]:
# One-Hot version: gender and every scientific question become indicator columns,
# each prefixed with the name of the column it comes from
oneHotFeatures = ["gender"] + ["Q" + str(questionNumber) for questionNumber in range(1, 28)]
codedData = pd.get_dummies(codedData, prefix=oneHotFeatures, columns=oneHotFeatures)
#codedData.head()

In [9]:
# Split the forms according to their temporality code (before: 0, after: 1, undefined: -5)
temporalityCodes = codedData["temporality"]
beforeForms = codedData.copy().loc[temporalityCodes == 0, :]
afterForms = codedData.copy().loc[temporalityCodes == 1, :]
undefForms = codedData.copy().loc[temporalityCodes == -5, :]
defForms = codedData.copy().loc[temporalityCodes >= 0, :]  # Either before or after
# Subjects with both a before and an after form: put the two forms side by side,
# telling their columns apart with the _before / _after suffixes
beforeAndAfterForms = pd.merge(
    left=beforeForms,
    right=afterForms,
    on="anonymousID",
    suffixes=('_before', '_after'),
)
#beforeForms.head()
#afterForms.head()
#undefForms.head()
#defForms.head()
#beforeAndAfterForms.head()

In [10]:
# Remove the ID feature: a new frame for the full set, in place for every split
allForms = codedData.copy().drop("anonymousID", axis=1)
for splitForms in (beforeForms, afterForms, undefForms, defForms, beforeAndAfterForms):
    splitForms.drop("anonymousID", axis=1, inplace=True)

With corrected scientific answers


In [11]:
# Work on a copy of the corrected answers, without the timestamp and free-text comments
uncodedCorrectedFeatures = ['timestamp', "comments"]
codedCorrectedData = correctedData.copy()
codedCorrectedData = codedCorrectedData.drop(uncodedCorrectedFeatures, axis=1)

In [12]:
# Fill NaN cells
# The filled columns are assigned back to the frame: calling fillna(inplace=True) on
# codedCorrectedData["..."] is a chained assignment, which no longer updates
# codedCorrectedData once pandas Copy-on-Write is active.
unansweredCorrectedColumns = ["biobrickKnowledge", "arcadePlay", "androidPlay", "previousPlay"]
codedCorrectedData[unansweredCorrectedColumns] = codedCorrectedData[unansweredCorrectedColumns].fillna("I don't know")

# Replace by code
# Which equivalence table applies to which column
correctedCodingByColumn = {
    "gameInterest": gameInterestCoding,
    "gameFrequency": gameFrequencyCoding,
    "biologyStudy": biologyStudyCoding,
    "biologyInterest": biologyInterestCoding,
    "synthBioKnowledge": previousKnowledgeCoding,
    "biobrickKnowledge": previousKnowledgeCoding,
    "previousVersion": previousPlayCoding,
    "previousPlay": previousPlayCoding,
    "arcadePlay": previousPlayCoding,
    "androidPlay": previousPlayCoding,
    "lang": languageCoding,
    "temporality": temporalityCoding,
}
# Whole columns are coded at once. The previous row loop took its bound from
# codedData (a different frame) and addressed rows by the labels 0..n-1; neither
# assumption is needed any more. The explicit lookup (rather than passing the dict
# to map) keeps raising KeyError on an answer missing from the table, NaN included,
# instead of silently producing NaN codes. Coded columns end up with an integer dtype.
for columnName, coding in correctedCodingByColumn.items():
    codedCorrectedData[columnName] = codedCorrectedData[columnName].map(lambda answer, coding=coding: coding[answer])

In [13]:
# One-Hot version of the corrected frame: only gender is expanded here
genderFeature = ["gender"]
codedCorrectedData = pd.get_dummies(codedCorrectedData, columns=genderFeature, prefix=genderFeature)
codedCorrectedData.head()


Out[13]:
gameInterest gameFrequency age biologyStudy biologyInterest synthBioKnowledge biobrickKnowledge previousVersion previousPlay arcadePlay ... Q25 Q26 Q27 anonymousID lang temporality gender_Female gender_Male gender_Other gender_Prefer not to say
0 5 5 23 3 5 1 0 0 1 0 ... 1 1 1 8d352896-a3f1-471c-8439-0f426df901c1 0 0 1 0 0 0
1 3 3 28 2 3 1 0 0 1 0 ... 1 1 1 7037c5b2-c286-498e-9784-9a061c778609 0 1 0 0 1 0
2 4 3 20 3 3 0 0 0 1 0 ... 1 1 1 5c4939b5-425b-4d19-b5d2-0384a515539e 0 1 1 0 0 0
3 3 3 21 3 4 1 0 0 1 0 ... 1 1 1 acb9c989-b4a6-4c4d-81cc-6b5783ec71d8 0 0 0 1 0 0
4 3 2 18 3 4 0 0 0 0 0 ... 0 0 1 1a03bc9e-bed4-4ddd-be7f-af23b1d5eb65 0 0 1 0 0 0

5 rows × 45 columns


In [14]:
# Split the corrected forms according to their temporality code (before: 0, after: 1, undefined: -5)
correctedTemporalityCodes = codedCorrectedData["temporality"]
beforeCorrectedForms = codedCorrectedData.copy().loc[correctedTemporalityCodes == 0, :]
afterCorrectedForms = codedCorrectedData.copy().loc[correctedTemporalityCodes == 1, :]
undefCorrectedForms = codedCorrectedData.copy().loc[correctedTemporalityCodes == -5, :]
defCorrectedForms = codedCorrectedData.copy().loc[correctedTemporalityCodes >= 0, :]  # Either before or after
# Subjects with both a before and an after form: put the two forms side by side,
# telling their columns apart with the _before / _after suffixes
beforeAndAfterCorrectedForms = pd.merge(
    left=beforeCorrectedForms,
    right=afterCorrectedForms,
    on="anonymousID",
    suffixes=('_before', '_after'),
)
#beforeCorrectedForms.head()
#afterCorrectedForms.head()
#undefCorrectedForms.head()
#defCorrectedForms.head()
#beforeAndAfterCorrectedForms.head()

In [15]:
# Remove the ID feature: a new frame for the full set, in place for every split
allCorrectedForms = codedCorrectedData.copy().drop("anonymousID", axis=1)
for splitCorrectedForms in (beforeCorrectedForms, afterCorrectedForms, undefCorrectedForms, defCorrectedForms, beforeAndAfterCorrectedForms):
    splitCorrectedForms.drop("anonymousID", axis=1, inplace=True)

For Classification


In [16]:
# Use defForms and defCorrectedForms for coded data

RedMetrics

For clustering


In [17]:
# Fetch RedMetrics data for the subjects who answered the gform
responders = getAllResponders()
allData = getAllUserVectorData(responders, _source=[])


C:\Users\Mikael\Anaconda3\envs\conda-env-python3-py\lib\site-packages\ipykernel_launcher.py:9: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  if __name__ == '__main__':

In [18]:
# Put subjects as rows and features as columns.
# NOTE: this rebinds allData to its own transpose, so running the cell a second time
# flips the frame back.
allData = allData.T
allData.head()


Out[18]:
sessionsCount scorebefore scoreafter scoreundefined complete configure craft death equip unequip ... 5 6 7 8 9 10 11 12 13 14
8d352896-a3f1-471c-8439-0f426df901c1 1.0 18.0 NaN NaN 0.0 0.0 5.0 5.0 16.0 1.0 ... 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09
7037c5b2-c286-498e-9784-9a061c778609 2.0 NaN 22.0 NaN 2.0 0.0 45.0 25.0 3.0 4.0 ... 3.686890e+02 1.333480e+02 1.753140e+02 1.830370e+02 5.475110e+02 1.582290e+02 6.438000e+00 2.750300e+01 1.550830e+02 1.422350e+02
5c4939b5-425b-4d19-b5d2-0384a515539e 1.0 NaN 15.0 NaN 0.0 0.0 7.0 11.0 25.0 3.0 ... 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09
acb9c989-b4a6-4c4d-81cc-6b5783ec71d8 1.0 23.0 NaN NaN 0.0 1.0 11.0 11.0 16.0 3.0 ... 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09 9.223372e+09
1a03bc9e-bed4-4ddd-be7f-af23b1d5eb65 1.0 3.0 NaN NaN 0.0 1.0 10.0 118.0 133.0 7.0 ... 6.856270e+02 4.544890e+02 1.379500e+02 2.749810e+02 5.320300e+01 6.418300e+01 3.634000e+00 4.095000e+00 9.223372e+09 9.223372e+09

5 rows × 40 columns


In [19]:
# Drop the columns in which all rows have the same value (they carry no information)
constantColumns = ["switch", "gotomooc"]
allData.drop(constantColumns, axis=1, inplace=True)
# Missing values become -1; later cells test "score >= 0" to detect an existing score
allData.fillna(value=-1, inplace=True)

In [20]:
# Get the subset of subjects who have both a before and an after score
hasBeforeScore = allData["scorebefore"] >= 0
hasAfterScore = allData["scoreafter"] >= 0
fullProcessData = allData[hasBeforeScore & hasAfterScore]
#fullProcessData.loc["01e85778-2903-447b-bbab-dd750564ee2d",:]

Combined with questionnaire answers


In [21]:
# Subjects who have a score for the after questionnaire
withAfterData = allData.copy()[allData["scoreafter"] >= 0]
# Expose the subject ID (the row label) as a column, without surrounding double quotes
withAfterData['anonymousID'] = withAfterData.index
withAfterData['anonymousID'] = withAfterData['anonymousID'].map(lambda subjectId: subjectId.strip('"'))

# Join with the after questionnaires, complete answers; the join key is dropped afterwards
afterNotCorrected = codedData.copy().loc[codedData["temporality"] == 1, :]
gameAndAfterData = pd.merge(withAfterData, afterNotCorrected, on="anonymousID").drop("anonymousID", axis=1)

# Join with the after questionnaires, corrected answers; the join key is dropped afterwards
afterCorrected = codedCorrectedData.copy().loc[codedCorrectedData["temporality"] == 1, :]
gameAndCorrectedAfterData = pd.merge(withAfterData, afterCorrected, on="anonymousID").drop("anonymousID", axis=1)
#gameAndAfterData.head()
#gameAndCorrectedAfterData.head()

For classification


In [22]:
# Classification works on a copy of the game data that carries the subject ID
# (the row label, without surrounding double quotes) as a column
allDataClassif = allData.copy()
allDataClassif['anonymousID'] = allData.index
allDataClassif['anonymousID'] = allDataClassif['anonymousID'].map(lambda subjectId: subjectId.strip('"'))

# If checkpoint not reached, set time to 3600 (1h)
def floorCheckpoints(value):
    """Cap a checkpoint time at 3600.

    Despite its name, the function bounds the value from above: anything
    greater than 3600 becomes 3600, any other value is returned unchanged.
    """
    return 3600 if value > 3600 else value
# Cap the checkpoint columns, i.e. the columns labelled with an integer.
# Any integer label is accepted (plain Python int or any NumPy integer type): testing
# for np.int64 only would silently skip the cap when the labels are stored as another
# integer type. bool is excluded because it is a subclass of int.
for col in allDataClassif.columns.values.tolist():
    if isinstance(col, (int, np.integer)) and not isinstance(col, bool):
        allDataClassif[col] = allDataClassif[col].apply(floorCheckpoints)
# Cap the completion time at 7200
allDataClassif["completionTime"] = allDataClassif["completionTime"].apply(lambda x: min(x, 7200))

# Cap thoroughness at 1000
allDataClassif["thoroughness"] = allDataClassif["thoroughness"].apply(lambda x: min(x, 1000))

Combined with questionnaire answers


In [23]:
# Join the game data with the corrected answers to the after questionnaire
gameAndCorrectedAfterDataClassif = pd.merge(withAfterData, afterCorrected, on="anonymousID")
gameAndCorrectedAfterDataClassif['anonymousID'] = gameAndCorrectedAfterDataClassif['anonymousID'].apply(lambda x: x.strip('"'))
# Drop the score columns of the other temporalities and the temporality code itself
gameAndCorrectedAfterDataClassif.drop(["scorebefore", "scoreundefined", "temporality"], axis=1, inplace = True)

# Cap the checkpoint columns, i.e. the columns labelled with an integer.
# Any integer label is accepted (plain Python int or any NumPy integer type): testing
# for np.int64 only would silently skip the cap when the labels are stored as another
# integer type. bool is excluded because it is a subclass of int.
for col in gameAndCorrectedAfterDataClassif.columns.values.tolist():
    if isinstance(col, (int, np.integer)) and not isinstance(col, bool):
        gameAndCorrectedAfterDataClassif[col] = gameAndCorrectedAfterDataClassif[col].apply(floorCheckpoints)
# Cap the completion time at 7200
gameAndCorrectedAfterDataClassif["completionTime"] = gameAndCorrectedAfterDataClassif["completionTime"].apply(lambda x: min(x, 7200))

# Cap thoroughness at 1000
gameAndCorrectedAfterDataClassif["thoroughness"] = gameAndCorrectedAfterDataClassif["thoroughness"].apply(lambda x: min(x, 1000))

In [24]:
# Get the subset of subjects who have a score for the before questionnaire
withBeforeData = allData.copy()[allData["scorebefore"] >= 0]
# Expose the subject ID (the row label) as a column, without surrounding double quotes
withBeforeData['anonymousID'] = withBeforeData.index
withBeforeData['anonymousID'] = withBeforeData['anonymousID'].apply(lambda x: x.strip('"'))
beforeCorrected = codedCorrectedData.copy().loc[codedCorrectedData["temporality"] == 0,:]
# Join the game data with the corrected answers to the before questionnaire
gameAndCorrectedBeforeDataClassif = pd.merge(withBeforeData, beforeCorrected, on="anonymousID")
gameAndCorrectedBeforeDataClassif['anonymousID'] = gameAndCorrectedBeforeDataClassif['anonymousID'].apply(lambda x: x.strip('"'))
# Drop the score columns of the other temporalities and the temporality code itself
gameAndCorrectedBeforeDataClassif.drop(["scoreafter", "scoreundefined", "temporality"], axis=1, inplace = True)

# If checkpoint not reached, set time to 3600 (1h).
# Checkpoint columns are the columns labelled with an integer. Any integer label is
# accepted (plain Python int or any NumPy integer type): testing for np.int64 only
# would silently skip the cap when the labels are stored as another integer type.
# bool is excluded because it is a subclass of int.
for col in gameAndCorrectedBeforeDataClassif.columns.values.tolist():
    if isinstance(col, (int, np.integer)) and not isinstance(col, bool):
        gameAndCorrectedBeforeDataClassif[col] = gameAndCorrectedBeforeDataClassif[col].apply(floorCheckpoints)
# Cap the completion time at 7200
gameAndCorrectedBeforeDataClassif["completionTime"] = gameAndCorrectedBeforeDataClassif["completionTime"].apply(lambda x: min(x, 7200))

# Cap thoroughness at 1000
gameAndCorrectedBeforeDataClassif["thoroughness"] = gameAndCorrectedBeforeDataClassif["thoroughness"].apply(lambda x: min(x, 1000))

In [ ]:


In [ ]:


In [ ]: