In [1]:

    
# Run the shared "User comparison" function notebook before anything else.
# NOTE(review): pd, np, gform, getCorrections, getAllResponders and getAllUserVectorData are
# used further down without being imported or defined in this notebook, so they are presumably
# brought in by this run (directly or through notebooks it runs in turn) - confirm.
%run "../Functions/4. User comparison.ipynb"









    



rmdf152 read_csv success






    



../Utilities/Preparation.ipynb:2: ParserWarning: Both a converter and dtype were specified for column customData.localplayerguid - only the converter will be used
  "cells": [






    



gformEN read_csv success
gformFR read_csv success
temporalities set

Prepare data

Prepare Google form data



In [2]:

    
# Give the Google Forms columns language-independent tags.
# Tags are paired with the form columns by position: 13 profile tags, Q1..Q27, then 4 trailing tags.
columnTags = (
    ["timestamp", "gameInterest", "gameFrequency", "age", "gender", "biologyStudy", "biologyInterest",
     "synthBioKnowledge", "biobrickKnowledge", "previousVersion", "previousPlay", "arcadePlay", "androidPlay"]
    + ["Q" + str(questionNumber) for questionNumber in range(1, 28)]
    + ["comments", "anonymousID", "lang", "temporality"]
)
columnQuestions = gform.columns.tolist()
googleData = gform.rename(columns=dict(zip(columnQuestions, columnTags)))
# Preview: googleData.head()



In [3]:

    
# Replace each answer to a scientific question (Q1..Q27) by its correction, stored through int()
correctedData = googleData.copy()
for rowId in range(len(correctedData)):
    # Look up the corrections of the subject who filled in this form
    playerId = correctedData.loc[rowId, "anonymousID"]
    correction = getCorrections(playerId)
    if correction.shape[1] == 0:
        # No correction column came back for this subject: keep the raw answers
        continue
    # Index the corrections by tag instead of by question wording
    correction = correction.rename(index=dict(zip(columnQuestions, columnTags)))
    # The correction column read for a form is named after the row id of that form
    for questionTag in ["Q" + str(questionNumber) for questionNumber in range(1, 28)]:
        correctedData.loc[rowId, questionTag] = int(correction.loc[questionTag, "corrections" + str(rowId)])

# Preview: correctedData.tail(15)



In [4]:

    
# Show which tag was paired with which question wording
[pair for pair in zip(columnQuestions, columnTags)]









    Out[4]:





[('Timestamp', 'timestamp'),
 ('Are you interested in video games?', 'gameInterest'),
 ('Do you play video games?', 'gameFrequency'),
 ('How old are you?', 'age'),
 ('What is your gender?', 'gender'),
 ('How long have you studied biology?', 'biologyStudy'),
 ('Are you interested in biology?', 'biologyInterest'),
 ('Before playing Hero.Coli, had you ever heard about synthetic biology?',
  'synthBioKnowledge'),
 ('Before playing Hero.Coli, had you ever heard about BioBricks?',
  'biobrickKnowledge'),
 ('Have you ever played an older version of Hero.Coli before?',
  'previousVersion'),
 ('Have you played the current version of Hero.Coli?', 'previousPlay'),
 ('Have you played the arcade cabinet version of Hero.Coli?', 'arcadePlay'),
 ('Have you played the Android version of Hero.Coli?', 'androidPlay'),
 ('In order to modify the abilities of the bacterium, you have to...', 'Q1'),
 ('What are BioBricks and devices?', 'Q2'),
 ('What is the name of this BioBrick? TER', 'Q3'),
 ('What is the name of this BioBrick? PR', 'Q4'),
 ('What is the name of this BioBrick? CDS', 'Q5'),
 ('What is the name of this BioBrick? RBS', 'Q6'),
 ('What does this BioBrick do? TER', 'Q7'),
 ('What does this BioBrick do? PR', 'Q8'),
 ('What does this BioBrick do? CDS', 'Q9'),
 ('What does this BioBrick do? RBS', 'Q10'),
 ('Pick the case where the BioBricks are well-ordered:', 'Q11'),
 ('When does green fluorescence happen?', 'Q12'),
 ('What happens when you unequip the movement device?', 'Q13'),
 ('What is this? PLASMID', 'Q14'),
 ('What does this device do? PCONS:RBS:GFP:TER', 'Q15'),
 ('What does this device do? PCONS:RBS:FLHDC:TER', 'Q16'),
 ('What does this device do? PCONS:RBS:AMPR:TER', 'Q17'),
 ('What does this device do? PBAD:RBS:GFP:TER', 'Q18'),
 ('What does this device do? PCONS:RBS:GFP:TER 2', 'Q19'),
 ('What does this device do? PCONS:RBS:FLHDC:TER 2', 'Q20'),
 ('What does this device do? PCONS:RBS:AMPR:TER 2', 'Q21'),
 ('What does this device do? PBAD:RBS:GFP:TER 2', 'Q22'),
 ('Guess: what would a device producing l-arabinose do, if it started with a l-arabinose-induced promoter?',
  'Q23'),
 ('Guess: the bacterium would glow yellow...', 'Q24'),
 ('What is the species of the bacterium of the game?', 'Q25'),
 ('What is the scientific name of the tails of the bacterium?', 'Q26'),
 ('Find the antibiotic:', 'Q27'),
 ('You can write down remarks here.', 'comments'),
 ('Do not edit -  pre-filled anonymous ID', 'anonymousID'),
 ('Language', 'lang'),
 ('Temporality', 'temporality')]

For association rule mining



In [5]:

    
# Keep only the columns from Q1 through Q27 (label slice, both ends included)
correctedScientific = correctedData.loc[:, slice("Q1", "Q27")]
# Preview: correctedScientific.head()

For clustering

With full answers



In [6]:

    
# Work on a copy of the tagged answers, without the timestamp and free-text comments columns
codedData = googleData.copy().drop(["timestamp", "comments"], axis="columns")
codedData.head(5)









    Out[6]:







  
    
      
      gameInterest
      gameFrequency
      age
      gender
      biologyStudy
      biologyInterest
      synthBioKnowledge
      biobrickKnowledge
      previousVersion
      previousPlay
      ...
      Q21
      Q22
      Q23
      Q24
      Q25
      Q26
      Q27
      anonymousID
      lang
      temporality
    
  
  
    
      0
      Extremely
      Extremely
      23
      Female
      Until bachelor's degree
      Extremely
      Yes
      NaN
      No
      Yes
      ...
      It generates antibiotic resistance
      It generates green fluorescence in presence of...
      After being induced, it would produce more and...
      I don't know
      E. Coli
      Flagella
      Ampicillin
      8d352896-a3f1-471c-8439-0f426df901c1
      en
      before
    
    
      1
      Moderately
      Moderately
      28
      Other
      Until the end of high school
      Moderately
      Yes
      NaN
      No
      Yes
      ...
      It generates antibiotic resistance
      It generates green fluorescence in presence of...
      After being induced, it would produce more and...
      I don't know
      E. Coli
      Flagella
      Ampicillin
      7037c5b2-c286-498e-9784-9a061c778609
      en
      after
    
    
      2
      A lot
      Moderately
      20
      Female
      Until bachelor's degree
      Moderately
      No
      NaN
      No
      Yes
      ...
      It generates antibiotic resistance
      It generates green fluorescence
      I don't know
      If it produces BFP under purple light
      E. Coli
      Flagella
      Ampicillin
      5c4939b5-425b-4d19-b5d2-0384a515539e
      en
      after
    
    
      3
      Moderately
      Moderately
      21
      Male
      Until bachelor's degree
      A lot
      Yes
      NaN
      No
      Yes
      ...
      It generates antibiotic resistance
      It generates green fluorescence in presence of...
      After being induced, it would produce more and...
      If it produces YFP under cyan light
      E. Coli
      Flagella
      Ampicillin
      acb9c989-b4a6-4c4d-81cc-6b5783ec71d8
      en
      before
    
    
      4
      Moderately
      Rarely
      18
      Female
      Until bachelor's degree
      A lot
      No
      No
      No
      No
      ...
      I don't know
      I don't know
      I don't know
      If it produced YFP under cyan light
      I don't know
      I don't know
      Ampicillin
      1a03bc9e-bed4-4ddd-be7f-af23b1d5eb65
      en
      before
    
  

5 rows × 42 columns



In [7]:

    
# Code answers with integers when possible

# Define equivalences: each dict maps an answer (English or French wording) to its integer code.
# gameInterest
gameInterestCoding = {"A lot": 4, "Beaucoup": 4, "Enormément": 5, "Extremely": 5, "Moderately": 3, "Moyennement": 3, "Slightly": 2, "Un peu": 2, "I don't know": 3, "Je ne sais pas": 3, "Not at all": 1, "Pas du tout": 1}
# gameFrequency
gameFrequencyCoding = {"A lot": 4, "Beaucoup": 4, "Enormément": 5, "Extremely": 5, "Moderately": 3, "Moyennement": 3, "Rarely": 2, "Un peu": 2, "I don't know": 3, "Je ne sais pas": 3, "Not at all": 1, "Pas du tout": 1}
# biologyStudy
biologyStudyCoding = {"Not even in middle school": 0, "Jamais": 0, "Jamais, pas même au collège": 0, "Until the end of middle school": 1, "Jusqu'au brevet": 1, "Until the end of high school": 2, "Jusqu'au bac": 2, "Until bachelor's degree": 3, "Jusqu'à la license": 3, "At least until master's degree": 4, "Au moins jusqu'au master": 4, "I don't know": 0, "Je ne sais pas": 0}
# biologyInterest
biologyInterestCoding = {"A lot": 4, "Beaucoup": 4, "Enormément": 5, "Extremely": 5, "Moderately": 3, "Moyennement": 3, "Slightly": 2, "Un peu": 2, "I don't know": 3, "Je ne sais pas": 3, "Not at all": 1, "Pas du tout": 1}
# synthBioKnowledge
# biobrickKnowledge
previousKnowledgeCoding = {"Yes": 1, "No": 0, "I don't know": 0, "Oui": 1, 'Non': 0, "Je ne sais pas": 0}
# previousVersion
# previousPlay
# arcadePlay
# androidPlay
previousPlayCoding = {"Multiple times": 3, "A few times": 2, "Once": 1, "Yes": 1, "No": 0, "I don't know": 0, "De nombreuses fois": 3, "Quelques fois": 2, "Une fois": 1, "Oui": 1, "Non": 0, "Je ne sais pas": 0}
# lang
languageCoding = {"en": 0, "fr": 1}
# temporality
temporalityCoding = {"before": 0, "after": 1, "undefined": -5}

# Fill NaN cells with the "I don't know" answer.
# Assigning the filled columns back (instead of calling fillna(inplace=True) on a column
# selection) guarantees that codedData itself is updated, whatever the pandas copy semantics.
_nanFilledColumns = ["biobrickKnowledge", "arcadePlay", "androidPlay", "previousPlay"]
codedData[_nanFilledColumns] = codedData[_nanFilledColumns].fillna("I don't know")

# Replace by code, one whole column at a time (no dependence on the row labels).
_columnCodings = {
    "gameInterest": gameInterestCoding,
    "gameFrequency": gameFrequencyCoding,
    "biologyStudy": biologyStudyCoding,
    "biologyInterest": biologyInterestCoding,
    "synthBioKnowledge": previousKnowledgeCoding,
    "biobrickKnowledge": previousKnowledgeCoding,
    "previousVersion": previousPlayCoding,
    "previousPlay": previousPlayCoding,
    "arcadePlay": previousPlayCoding,
    "androidPlay": previousPlayCoding,
    "lang": languageCoding,
    "temporality": temporalityCoding,
}
for _columnName, _coding in _columnCodings.items():
    # Series.map would silently turn an unlisted answer (or a remaining NaN) into NaN;
    # fail with a KeyError instead, as a plain dictionary lookup would.
    _unknownAnswers = set(codedData[_columnName].unique()) - set(_coding)
    if _unknownAnswers:
        raise KeyError("No code defined for answer(s) {!r} in column {!r}".format(_unknownAnswers, _columnName))
    codedData[_columnName] = codedData[_columnName].map(_coding)



In [8]:

    
# One-hot encode gender and the 27 scientific answers; every dummy column is prefixed
# with the name of the column it comes from
_oneHotColumns = ["gender"] + ["Q" + str(questionNumber) for questionNumber in range(1, 28)]
codedData = pd.get_dummies(codedData, prefix=_oneHotColumns, columns=_oneHotColumns)
# Preview: codedData.head()



In [9]:

    
# Partition the coded forms by temporality code: 0 = before, 1 = after, -5 = undefined
beforeForms = codedData.copy().loc[codedData["temporality"].eq(0), :]
afterForms = codedData.copy().loc[codedData["temporality"].eq(1), :]
undefForms = codedData.copy().loc[codedData["temporality"].eq(-5), :]
# Defined temporality: either before or after
defForms = codedData.copy().loc[codedData["temporality"].ge(0), :]
# Subjects having both a before and an after form: one row joining the two,
# with the columns suffixed accordingly
beforeAndAfterForms = beforeForms.merge(afterForms, on="anonymousID", suffixes=('_before', '_after'))
# Previews:
#beforeForms.head()
#afterForms.head()
#undefForms.head()
#defForms.head()
#beforeAndAfterForms.head()



In [10]:

    
# Strip the subject identifier from every table
allForms = codedData.copy().drop("anonymousID", axis=1)
# The split tables are modified in place; codedData itself keeps its anonymousID column
for _forms in (beforeForms, afterForms, undefForms, defForms, beforeAndAfterForms):
    _forms.drop("anonymousID", axis=1, inplace=True)

With corrected scientific answers



In [11]:

    
# Same starting point as codedData, but built from the corrected answers:
# a copy without the timestamp and free-text comments columns
codedCorrectedData = correctedData.copy().drop(["timestamp", "comments"], axis="columns")



In [12]:

    
# Fill NaN cells with the "I don't know" answer.
# Assigning the filled columns back (instead of calling fillna(inplace=True) on a column
# selection) guarantees that codedCorrectedData itself is updated, whatever the pandas copy semantics.
_nanFilledColumns = ["biobrickKnowledge", "arcadePlay", "androidPlay", "previousPlay"]
codedCorrectedData[_nanFilledColumns] = codedCorrectedData[_nanFilledColumns].fillna("I don't know")

# Replace by code, one whole column at a time, using the coding dictionaries defined earlier
# in the notebook. Working column-wise also removes the former dependence on the number of
# rows of codedData (a different table) and on the row labels.
_columnCodings = {
    "gameInterest": gameInterestCoding,
    "gameFrequency": gameFrequencyCoding,
    "biologyStudy": biologyStudyCoding,
    "biologyInterest": biologyInterestCoding,
    "synthBioKnowledge": previousKnowledgeCoding,
    "biobrickKnowledge": previousKnowledgeCoding,
    "previousVersion": previousPlayCoding,
    "previousPlay": previousPlayCoding,
    "arcadePlay": previousPlayCoding,
    "androidPlay": previousPlayCoding,
    "lang": languageCoding,
    "temporality": temporalityCoding,
}
for _columnName, _coding in _columnCodings.items():
    # Series.map would silently turn an unlisted answer (or a remaining NaN) into NaN;
    # fail with a KeyError instead, as a plain dictionary lookup would.
    _unknownAnswers = set(codedCorrectedData[_columnName].unique()) - set(_coding)
    if _unknownAnswers:
        raise KeyError("No code defined for answer(s) {!r} in column {!r}".format(_unknownAnswers, _columnName))
    codedCorrectedData[_columnName] = codedCorrectedData[_columnName].map(_coding)



In [13]:

    
# Only gender needs one-hot encoding here: the scientific answers already hold their corrections
codedCorrectedData = pd.get_dummies(
    codedCorrectedData,
    prefix=["gender"],
    columns=["gender"],
)
codedCorrectedData.head(5)









    Out[13]:







  
    
      
      gameInterest
      gameFrequency
      age
      biologyStudy
      biologyInterest
      synthBioKnowledge
      biobrickKnowledge
      previousVersion
      previousPlay
      arcadePlay
      ...
      Q25
      Q26
      Q27
      anonymousID
      lang
      temporality
      gender_Female
      gender_Male
      gender_Other
      gender_Prefer not to say
    
  
  
    
      0
      5
      5
      23
      3
      5
      1
      0
      0
      1
      0
      ...
      1
      1
      1
      8d352896-a3f1-471c-8439-0f426df901c1
      0
      0
      1
      0
      0
      0
    
    
      1
      3
      3
      28
      2
      3
      1
      0
      0
      1
      0
      ...
      1
      1
      1
      7037c5b2-c286-498e-9784-9a061c778609
      0
      1
      0
      0
      1
      0
    
    
      2
      4
      3
      20
      3
      3
      0
      0
      0
      1
      0
      ...
      1
      1
      1
      5c4939b5-425b-4d19-b5d2-0384a515539e
      0
      1
      1
      0
      0
      0
    
    
      3
      3
      3
      21
      3
      4
      1
      0
      0
      1
      0
      ...
      1
      1
      1
      acb9c989-b4a6-4c4d-81cc-6b5783ec71d8
      0
      0
      0
      1
      0
      0
    
    
      4
      3
      2
      18
      3
      4
      0
      0
      0
      0
      0
      ...
      0
      0
      1
      1a03bc9e-bed4-4ddd-be7f-af23b1d5eb65
      0
      0
      1
      0
      0
      0
    
  

5 rows × 45 columns



In [14]:

    
# Partition the corrected forms by temporality code: 0 = before, 1 = after, -5 = undefined
beforeCorrectedForms = codedCorrectedData.copy().loc[codedCorrectedData["temporality"].eq(0), :]
afterCorrectedForms = codedCorrectedData.copy().loc[codedCorrectedData["temporality"].eq(1), :]
undefCorrectedForms = codedCorrectedData.copy().loc[codedCorrectedData["temporality"].eq(-5), :]
# Defined temporality: either before or after
defCorrectedForms = codedCorrectedData.copy().loc[codedCorrectedData["temporality"].ge(0), :]
# Subjects having both a before and an after form: one row joining the two,
# with the columns suffixed accordingly
beforeAndAfterCorrectedForms = beforeCorrectedForms.merge(afterCorrectedForms, on="anonymousID", suffixes=('_before', '_after'))
# Previews:
#beforeCorrectedForms.head()
#afterCorrectedForms.head()
#undefCorrectedForms.head()
#defCorrectedForms.head()
#beforeAndAfterCorrectedForms.head()



In [15]:

    
# Strip the subject identifier from every corrected table
allCorrectedForms = codedCorrectedData.copy().drop("anonymousID", axis=1)
# The split tables are modified in place; codedCorrectedData itself keeps its anonymousID column
for _correctedForms in (beforeCorrectedForms, afterCorrectedForms, undefCorrectedForms, defCorrectedForms, beforeAndAfterCorrectedForms):
    _correctedForms.drop("anonymousID", axis=1, inplace=True)

For Classification



In [16]:

    
# Use defForms and defCorrectedForms for coded data

RedMetrics

For clustering



In [17]:

    
# Fetch RedMetrics data for subjects which answered the gform
# getAllResponders() and getAllUserVectorData() are not defined in this notebook.
# NOTE(review): the following cell transposes the result "to put subjects as rows", so the
# helper presumably returns one column per subject - confirm against its definition.
# NOTE(review): the meaning of `_source=[]` is not visible from here - confirm.
allData = getAllUserVectorData(getAllResponders(), _source=[])









    





 
 










    



C:\Users\Mikael\Anaconda3\envs\conda-env-python3-py\lib\site-packages\ipykernel_launcher.py:9: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  if __name__ == '__main__':



In [18]:

    
# Flip the table so that each subject is a row and each feature a column.
# Caution: running this cell a second time flips the table back.
allData = allData.T
allData.head(5)









    Out[18]:







  
    
      
      sessionsCount
      scorebefore
      scoreafter
      scoreundefined
      complete
      configure
      craft
      death
      equip
      unequip
      ...
      5
      6
      7
      8
      9
      10
      11
      12
      13
      14
    
  
  
    
      8d352896-a3f1-471c-8439-0f426df901c1
      1.0
      18.0
      NaN
      NaN
      0.0
      0.0
      5.0
      5.0
      16.0
      1.0
      ...
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
    
    
      7037c5b2-c286-498e-9784-9a061c778609
      2.0
      NaN
      22.0
      NaN
      2.0
      0.0
      45.0
      25.0
      3.0
      4.0
      ...
      3.686890e+02
      1.333480e+02
      1.753140e+02
      1.830370e+02
      5.475110e+02
      1.582290e+02
      6.438000e+00
      2.750300e+01
      1.550830e+02
      1.422350e+02
    
    
      5c4939b5-425b-4d19-b5d2-0384a515539e
      1.0
      NaN
      15.0
      NaN
      0.0
      0.0
      7.0
      11.0
      25.0
      3.0
      ...
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
    
    
      acb9c989-b4a6-4c4d-81cc-6b5783ec71d8
      1.0
      23.0
      NaN
      NaN
      0.0
      1.0
      11.0
      11.0
      16.0
      3.0
      ...
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
      9.223372e+09
    
    
      1a03bc9e-bed4-4ddd-be7f-af23b1d5eb65
      1.0
      3.0
      NaN
      NaN
      0.0
      1.0
      10.0
      118.0
      133.0
      7.0
      ...
      6.856270e+02
      4.544890e+02
      1.379500e+02
      2.749810e+02
      5.320300e+01
      6.418300e+01
      3.634000e+00
      4.095000e+00
      9.223372e+09
      9.223372e+09
    
  

5 rows × 40 columns



In [19]:

    
# "switch" and "gotomooc" hold the same value on every row, so they carry no information;
# remaining missing values are then replaced by the negative marker -1
allData = allData.drop(["switch", "gotomooc"], axis=1).fillna(-1)



In [20]:

    
# Subjects with both a before score and an after score
# (a missing score was replaced by -1 earlier, hence the >= 0 tests)
fullProcessData = allData[(allData["scorebefore"] >= 0) & (allData["scoreafter"] >= 0)]
# Example lookup: fullProcessData.loc["01e85778-2903-447b-bbab-dd750564ee2d",:]

Combined with questionnaire answers



In [21]:

    
# Subjects who answered the after questionnaire (non-negative after score)
withAfterData = allData.copy()[allData["scoreafter"] >= 0]
# Expose the index as a join key, without any surrounding double quotes
withAfterData['anonymousID'] = [userId.strip('"') for userId in withAfterData.index]

# Game data joined with the after forms holding the complete (one-hot) answers
afterNotCorrected = codedData.copy().loc[codedData["temporality"].eq(1), :]
gameAndAfterData = withAfterData.merge(afterNotCorrected, on="anonymousID").drop("anonymousID", axis=1)

# Game data joined with the after forms holding the corrected answers
afterCorrected = codedCorrectedData.copy().loc[codedCorrectedData["temporality"].eq(1), :]
gameAndCorrectedAfterData = withAfterData.merge(afterCorrected, on="anonymousID").drop("anonymousID", axis=1)
# Previews:
#gameAndAfterData.head()
#gameAndCorrectedAfterData.head()

For classification



In [22]:

    
# Classification works on a copy of the game data, with the index exposed as an
# anonymousID column stripped of any surrounding double quotes
allDataClassif = allData.copy()
allDataClassif['anonymousID'] = [userId.strip('"') for userId in allData.index]

# If checkpoint not reached, set time to 3600 (1h)
def floorCheckpoints(value, cap=3600):
    """Cap a checkpoint time.

    Despite its name, this function applies an upper bound, not a floor.

    Parameters:
        value: the time to cap.
        cap: the largest value allowed; defaults to 3600, the value used for checkpoints.

    Returns:
        `cap` when `value` is strictly greater than `cap`, otherwise `value` unchanged
        (a NaN compares as not greater, so it is returned as is).
    """
    return cap if value > cap else value
# Cap every column whose label is a np.int64 with floorCheckpoints
# (NOTE(review): these integer-labelled columns are presumably the checkpoint times - confirm)
for col in [label for label in allDataClassif.columns.values.tolist() if isinstance(label, np.int64)]:
    allDataClassif[col] = allDataClassif[col].apply(floorCheckpoints)
# Completion time is capped at 7200
allDataClassif["completionTime"] = allDataClassif["completionTime"].apply(lambda duration: min(duration, 7200))

# Thoroughness is capped at 1000
allDataClassif["thoroughness"] = allDataClassif["thoroughness"].apply(lambda score: min(score, 1000))

Combined with questionnaire answers



In [23]:

    
# Game data of the after-questionnaire subjects joined with their corrected after forms
gameAndCorrectedAfterDataClassif = withAfterData.merge(afterCorrected, on="anonymousID")
gameAndCorrectedAfterDataClassif['anonymousID'] = [userId.strip('"') for userId in gameAndCorrectedAfterDataClassif['anonymousID']]
# Drop the two other score columns and the temporality column
gameAndCorrectedAfterDataClassif = gameAndCorrectedAfterDataClassif.drop(["scorebefore", "scoreundefined", "temporality"], axis=1)

# Cap every column whose label is a np.int64 with floorCheckpoints
for col in [label for label in gameAndCorrectedAfterDataClassif.columns.values.tolist() if isinstance(label, np.int64)]:
    gameAndCorrectedAfterDataClassif[col] = gameAndCorrectedAfterDataClassif[col].apply(floorCheckpoints)
# Completion time is capped at 7200
gameAndCorrectedAfterDataClassif["completionTime"] = gameAndCorrectedAfterDataClassif["completionTime"].apply(lambda duration: min(duration, 7200))

# Thoroughness is capped at 1000
gameAndCorrectedAfterDataClassif["thoroughness"] = gameAndCorrectedAfterDataClassif["thoroughness"].apply(lambda score: min(score, 1000))



In [24]:

    
# Subjects who answered the before questionnaire (non-negative before score)
withBeforeData = allData.copy()[allData["scorebefore"] >= 0]
# Expose the index as a join key, without any surrounding double quotes
withBeforeData['anonymousID'] = [userId.strip('"') for userId in withBeforeData.index]
beforeCorrected = codedCorrectedData.copy().loc[codedCorrectedData["temporality"].eq(0), :]
# Game data joined with the corrected before forms
gameAndCorrectedBeforeDataClassif = withBeforeData.merge(beforeCorrected, on="anonymousID")
gameAndCorrectedBeforeDataClassif['anonymousID'] = [userId.strip('"') for userId in gameAndCorrectedBeforeDataClassif['anonymousID']]
# Drop the two other score columns and the temporality column
gameAndCorrectedBeforeDataClassif = gameAndCorrectedBeforeDataClassif.drop(["scoreafter", "scoreundefined", "temporality"], axis=1)

# Cap every column whose label is a np.int64 with floorCheckpoints
for col in [label for label in gameAndCorrectedBeforeDataClassif.columns.values.tolist() if isinstance(label, np.int64)]:
    gameAndCorrectedBeforeDataClassif[col] = gameAndCorrectedBeforeDataClassif[col].apply(floorCheckpoints)
# Completion time is capped at 7200
gameAndCorrectedBeforeDataClassif["completionTime"] = gameAndCorrectedBeforeDataClassif["completionTime"].apply(lambda duration: min(duration, 7200))

# Thoroughness is capped at 1000
gameAndCorrectedBeforeDataClassif["thoroughness"] = gameAndCorrectedBeforeDataClassif["thoroughness"].apply(lambda score: min(score, 1000))



In [ ]:



In [ ]:



In [ ]:

	gameInterest	gameFrequency	age	gender	biologyStudy	biologyInterest	synthBioKnowledge	biobrickKnowledge	previousVersion	previousPlay	...	Q21	Q22	Q23	Q24	Q25	Q26	Q27	anonymousID	lang	temporality
0	Extremely	Extremely	23	Female	Until bachelor's degree	Extremely	Yes	NaN	No	Yes	...	It generates antibiotic resistance	It generates green fluorescence in presence of...	After being induced, it would produce more and...	I don't know	E. Coli	Flagella	Ampicillin	8d352896-a3f1-471c-8439-0f426df901c1	en	before
1	Moderately	Moderately	28	Other	Until the end of high school	Moderately	Yes	NaN	No	Yes	...	It generates antibiotic resistance	It generates green fluorescence in presence of...	After being induced, it would produce more and...	I don't know	E. Coli	Flagella	Ampicillin	7037c5b2-c286-498e-9784-9a061c778609	en	after
2	A lot	Moderately	20	Female	Until bachelor's degree	Moderately	No	NaN	No	Yes	...	It generates antibiotic resistance	It generates green fluorescence	I don't know	If it produces BFP under purple light	E. Coli	Flagella	Ampicillin	5c4939b5-425b-4d19-b5d2-0384a515539e	en	after
3	Moderately	Moderately	21	Male	Until bachelor's degree	A lot	Yes	NaN	No	Yes	...	It generates antibiotic resistance	It generates green fluorescence in presence of...	After being induced, it would produce more and...	If it produces YFP under cyan light	E. Coli	Flagella	Ampicillin	acb9c989-b4a6-4c4d-81cc-6b5783ec71d8	en	before
4	Moderately	Rarely	18	Female	Until bachelor's degree	A lot	No	No	No	No	...	I don't know	I don't know	I don't know	If it produced YFP under cyan light	I don't know	I don't know	Ampicillin	1a03bc9e-bed4-4ddd-be7f-af23b1d5eb65	en	before

	sessionsCount	scorebefore	scoreafter	scoreundefined	complete	configure	craft	death	equip	unequip	...	5	6	7	8	9	10	11	12	13	14
8d352896-a3f1-471c-8439-0f426df901c1	1.0	18.0	NaN	NaN	0.0	0.0	5.0	5.0	16.0	1.0	...	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09
7037c5b2-c286-498e-9784-9a061c778609	2.0	NaN	22.0	NaN	2.0	0.0	45.0	25.0	3.0	4.0	...	3.686890e+02	1.333480e+02	1.753140e+02	1.830370e+02	5.475110e+02	1.582290e+02	6.438000e+00	2.750300e+01	1.550830e+02	1.422350e+02
5c4939b5-425b-4d19-b5d2-0384a515539e	1.0	NaN	15.0	NaN	0.0	0.0	7.0	11.0	25.0	3.0	...	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09
acb9c989-b4a6-4c4d-81cc-6b5783ec71d8	1.0	23.0	NaN	NaN	0.0	1.0	11.0	11.0	16.0	3.0	...	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09	9.223372e+09
1a03bc9e-bed4-4ddd-be7f-af23b1d5eb65	1.0	3.0	NaN	NaN	0.0	1.0	10.0	118.0	133.0	7.0	...	6.856270e+02	4.544890e+02	1.379500e+02	2.749810e+02	5.320300e+01	6.418300e+01	3.634000e+00	4.095000e+00	9.223372e+09	9.223372e+09