2. Google form analysis

Analysis of results extracted from Google forms in csv format.

general purpose
sessions and temporalities
score
visualizations
sample getters
checkpoint validation
p(answered question N | answered question P)
Filtering users

Preparation



In [ ]:

    
# Execute the game-sessions notebook in this kernel before anything else.
# NOTE(review): it presumably provides the names used below without being
# defined here (gform, correctAnswers, answerTemporalities, Q* keys, ...) - confirm.
%run "../Functions/1. Game sessions.ipynb"
# Trace which notebook has been loaded.
print("2. Google form analysis")

Constants



In [ ]:

    
# special user ids
# Hand-picked user ids naming typical cases (no answer, one or several
# answers/scores, per language).
# NOTE(review): "1.52" and "1.52.2" look like version labels of the data set
# or of the game - confirm.
# Every name assigned in the "1.52" group is assigned again in the "1.52.2"
# group below, so only the second group's values remain once the cell has run.
# 1.52
userIdThatDidNotAnswer = '001c95c6-8207-43dc-a51b-adf0c6e005d7'

userId1AnswerEN = '00dbbdca-d86c-4bc9-803c-0602e0153f68'
userIdAnswersEN = '5977184a-1be2-4725-9b48-f2782dc03efb'
userId1ScoreEN = '6b5d392d-b737-49ef-99af-e8c445ff6379'
userIdScoresEN = '5ecf601d-4eac-433e-8056-3a5b9eda0555'

userId1AnswerFR = '2734a37d-4ba5-454f-bf85-1f7b767138f6'
userIdAnswersFR = '01e85778-2903-447b-bbab-dd750564ee2d'
userId1ScoreFR = '3d733347-0313-441a-b77c-3e4046042a53'
userIdScoresFR = '58d22690-8604-41cf-a5b7-d71fb3b9ad5b'

userIdAnswersENFR = 'a7936587-8b71-43b6-9c61-17b2c2b55de3'

# 1.52.2
# In this group the "Answer(s)" and "Score(s)" names of a given language
# share the same id.
userIdThatDidNotAnswer = '6919aa9a-f18e-4fc5-8435-c26b869ba571'
userIdAnswersFR = '0135e29b-678d-4188-a935-1d0bfec9450b'
userIdScoresFR = '0135e29b-678d-4188-a935-1d0bfec9450b'
userId1AnswerFR = '01cc303e-d7c1-4c84-8e17-182b410da343'
userId1ScoreFR = '01cc303e-d7c1-4c84-8e17-182b410da343'
userId1AnswerEN = '027fb5ca-c40a-4977-852a-e448538061f2'
userId1ScoreEN = '027fb5ca-c40a-4977-852a-e448538061f2'
userIdAnswersEN = '1e94b693-df8f-4ad0-9f02-4aac6929bdaa'
userIdScoresEN = '1e94b693-df8f-4ad0-9f02-4aac6929bdaa'
userIdAnswersENFR = '2ad10897-b143-45f4-9a78-60ee4bcecc80'



In [ ]:

    
# Name of the gform column that holds the user id, and its position among
# the gform columns.
# The commented-out value is a former column title (French for
# "Do not modify - prefilled anonymous identifier").
#localplayerguidkey = 'Ne pas modifier - identifiant anonyme prérempli'
localplayerguidkey = 'userId'
localplayerguidindex = gform.columns.get_loc(localplayerguidkey)
# bare expression: displayed as the cell output
localplayerguidindex



In [ ]:

    
# Column key of the first evaluation question and its position among the
# gform columns.
# NOTE(review): QGenotypePhenotype is defined outside this notebook; that the
# evaluation questions follow it contiguously is an assumption - confirm.
firstEvaluationQuestionKey = QGenotypePhenotype
firstEvaluationQuestionIndex = gform.columns.get_loc(firstEvaluationQuestionKey)
# bare expression: displayed as the cell output
firstEvaluationQuestionIndex



In [ ]:

    
# Prefixes of the column names built by getAnswers ("answers" + gform row
# label) and by getCorrections ("corrections" + gform row label).
answersColumnNameStem = "answers"
correctionsColumnNameStem = "corrections"

Functions

general purpose



In [ ]:

    
def getUniqueUserCount(gfDF):
    """Return how many distinct user ids appear in the form answers.

    gfDF: form answers, with a user id column named `localplayerguidkey`.
    """
    userIdColumn = gfDF[localplayerguidkey]
    return userIdColumn.nunique()



In [ ]:

    
def getAllResponders( _gfDF ):
    """Return the distinct user ids found in the user id column of _gfDF."""
    return _gfDF[localplayerguidkey].unique()

def getRandomGFormGUID( _gfDF = None ):
    """Return a random, GUID-formatted user id among the form responders.

    _gfDF: form answers to draw from; defaults to the global `gform` so that
           the former argument-less call keeps working.

    Raises ValueError when no responder id passes `isGUIDFormat` (the former
    rejection-sampling loop would never have terminated in that case).
    """
    # local import: this cell does not own the notebook's import list
    import random

    if _gfDF is None:
        _gfDF = gform

    # the original called getAllResponders() without its mandatory argument,
    # which raised a TypeError on every call
    _validGUIDs = [_guid for _guid in getAllResponders(_gfDF) if isGUIDFormat(_guid)]
    if 0 == len(_validGUIDs):
        raise ValueError("no GUID-formatted user id found in the form answers")

    # uniform draw among valid ids, as the rejection sampling used to do
    return random.choice(_validGUIDs)

def hasAnswered( userId, _gfDF ):
    """Tell whether userId appears in the user id column of _gfDF."""
    knownUserIds = _gfDF[localplayerguidkey].values
    return userId in knownUserIds

def getAnswers( userId, _gfDF ):
    """Return the answers of one user, one column per submitted form.

    The rows of _gfDF belonging to userId are transposed: questions become
    the index, and each form becomes a column named
    answersColumnNameStem + <row label in _gfDF>.
    When the user has no row, a message is printed and the transposed empty
    selection (no column) is returned.
    """
    userRows = _gfDF[_gfDF[localplayerguidkey]==userId]
    _columnAnswers = userRows.T

    if len(userRows) == 0:
        # user has never answered
        print("user " + str(userId) + " has never answered")
        return _columnAnswers

    _columnAnswers.columns = [
        answersColumnNameStem + str(rowLabel) for rowLabel in _columnAnswers.columns
    ]
    return _columnAnswers

sessions and temporalities



In [ ]:

    
def resetTemporalities(_gfDF):
    """Overwrite, in place, the temporality of every row of _gfDF with
    answerTemporalities[2]."""
    resetValue = answerTemporalities[2]
    _gfDF[QTemporality] = resetValue



In [ ]:

    
#gform[QPlayed].unique()



In [ ]:

    
# Possible answers to the "already played?" question (QPlayed), split into
# two lists; in each list, the answers belonging to the other list are kept
# as commented-out entries.

# answers that show that this survey was a pretest
alreadyPlayedPretestAnswers = [
    'No / not yet', 
#    'I just played for the first time',
    'I played it some time ago', # certainly an older version of the game
#    'I played it multiple times recently',
#    'I played recently on an other computer', # has to fill in profile questions again
#    'I played it multiple times recently on this computer'
]

# answers that show that this survey was a posttest
# (complement of the list above; not read by any function in this cell)
alreadyPlayedPosttestAnswers = [
#    'No / not yet', 
    'I just played for the first time',
#    'I played it some time ago',
    'I played it multiple times recently',
    'I played recently on an other computer',
    'I played it multiple times recently on this computer'
]

# based only on user answer
# answer given by users who played on another computer (see the note in the
# pretest list: they have to fill in the profile questions again)
APlayedButProfileAgain = 'I played recently on an other computer'

def setAnswerTemporalitiesSimple(_gfDF):
    """Set the temporality of every row from the user's own QPlayed answer.

    Edits _gfDF in place: rows whose QPlayed answer belongs to
    alreadyPlayedPretestAnswers get answerTemporalities[0], every other row
    (missing answers included) gets answerTemporalities[1].
    Nothing is done unless the temporality column holds exactly one distinct
    value, which is how "not set yet" is detected.
    """
    # check whether temporalities have already been set
    if(len(_gfDF[QTemporality].unique()) == 1):
        # vectorised membership test; unlike a per-label .loc loop it also
        # works when index labels are not unique
        isPretest = _gfDF[QPlayed].isin(alreadyPlayedPretestAnswers)
        _gfDF.loc[isPretest, QTemporality] = answerTemporalities[0]
        _gfDF.loc[~isPretest, QTemporality] = answerTemporalities[1]
        print("temporalities set (user answer method)")



In [ ]:

    
# based only on first meaningful game event
def setAnswerTemporalities(_gfDF):
    """Set the temporality of every row from the user's first game event.

    Edits _gfDF in place. Each answer is compared to the date returned by
    getFirstEventDate for its user (see getTemporality). Per user, at most
    one row keeps answerTemporalities[0] (the one with the latest timestamp)
    and at most one keeps answerTemporalities[1] (the one with the earliest
    timestamp); the others are set to answerTemporalities[2].
    Nothing is done unless the temporality column holds exactly one distinct
    value, which is how "not set yet" is detected.
    """
    # check whether temporalities have already been set
    if(len(_gfDF[QTemporality].unique()) == 1):
        # format : key = _userId,
        # value = [_firstEventDate, None or _gfDF.index of before, None or _gfDF.index of after]
        # None (and not 0) marks "no row retained yet": 0 is a valid index
        # label, so a survey stored at row 0 could never be recognised as the
        # retained one with the former 0 sentinel.
        temporalities = {}

        for _index in _gfDF.index:
            _userId = _gfDF.loc[_index,localplayerguidkey]

            if _userId in temporalities:
                _firstEventDate, beforeIndex, afterIndex = temporalities[_userId]
            else:
                # first survey seen for this user: look up the event date once
                _firstEventDate, beforeIndex, afterIndex = getFirstEventDate(_userId), None, None

            temporality = getTemporality(_gfDF.loc[_index,QTimestamp],_firstEventDate)

            if temporality == answerTemporalities[0] and beforeIndex is not None:
                # two candidates before the first event: keep the latest one
                if _gfDF.loc[_index,QTimestamp] > _gfDF.loc[beforeIndex,QTimestamp]:
                    _gfDF.loc[beforeIndex,QTemporality] = answerTemporalities[2]
                else:
                    temporality = answerTemporalities[2]
            elif temporality == answerTemporalities[1] and afterIndex is not None:
                # two candidates after the first event: keep the earliest one
                if _gfDF.loc[_index,QTimestamp] < _gfDF.loc[afterIndex,QTimestamp]:
                    _gfDF.loc[afterIndex,QTemporality] = answerTemporalities[2]
                else:
                    temporality = answerTemporalities[2]

            _gfDF.loc[_index,QTemporality] = temporality
            if temporality == answerTemporalities[0]:
                beforeIndex = _index
            elif temporality == answerTemporalities[1]:
                afterIndex = _index

            temporalities[_userId] = [_firstEventDate, beforeIndex, afterIndex]
        print("temporalities set (first event method)")



In [ ]:

    
# Classifies a survey answer relatively to a game event date.
# answerDate is assumed to be the gform Timestamp, UTC
# gameEventDate is assumed to be of type pandas._libs.tslib.Timestamp, UTC, from RedMetrics
def getTemporality( answerDate, gameEventDate ):
    """Return answerTemporalities[0] when answerDate <= gameEventDate,
    answerTemporalities[1] when answerDate > gameEventDate, and
    answerTemporalities[2] otherwise.

    A gameEventDate equal to the UTC-localized pd.Timestamp.max always yields
    answerTemporalities[2]; so do dates for which neither comparison holds.
    """
    noEventMarker = pd.Timestamp.max.tz_localize('utc')
    if gameEventDate != noEventMarker:
        if answerDate <= gameEventDate:
            return answerTemporalities[0]
        if answerDate > gameEventDate:
            return answerTemporalities[1]
    return answerTemporalities[2]



In [ ]:

    
# should be based on events on a 24h window
def setAnswerTemporalities2( _gfDF, _rmDF ):
    """Set temporalities from the share of game events before/after each answer.

    Edits _gfDF in place. For every row, getEventCountRatios gives the
    fraction of the user's events in _rmDF that precede and follow the
    answer date. Per user, the first row with at least as many events before
    as after (and some before) is marked answerTemporalities[1]; otherwise
    the first row with at least as many events after as before (and some
    after) is marked answerTemporalities[0]. Other rows are left untouched.
    Nothing is done unless the temporality column holds exactly one distinct
    value, which is how "not set yet" is detected.
    """
    # check whether temporalities have already been set
    if(len(_gfDF[QTemporality].unique()) == 1):
        # format : key = _userId,
        # value = [pretestBeforeRatio, posttestAfterRatio, None or pretestIndex, None or posttestIndex]
        # None (and not 0) marks "no row selected yet": 0 is a valid index
        # label, so with the former 0 sentinel a survey stored at row 0 did
        # not prevent later rows from being selected as well.
        temporalities = {}

        for _index in _gfDF.index:
            _userId = _gfDF.loc[_index,localplayerguidkey]
            pretestBeforeRatio, posttestAfterRatio, pretestIndex, posttestIndex = [1.0, 1.0, None, None]

            answerDate = _gfDF.loc[_index,QTimestamp]
            [eventsBeforeRatio, eventsAfterRatio] = getEventCountRatios(answerDate, _userId, _rmDF, _gfDF)

            if _userId in temporalities:
                pretestBeforeRatio, posttestAfterRatio, pretestIndex, posttestIndex = temporalities[_userId]

            if ((eventsBeforeRatio == eventsAfterRatio) and (0 != eventsBeforeRatio)):
                # str(): the id read from the frame is not guaranteed to be a string
                print("anomaly for userId=" + str(_userId) + ": eventsBeforeRatio == eventsAfterRatio != 0")

            # update posttest if there are less events afterwards?
            # keep the oldest anyways?
            # improvement idea: when a posttest is already selected, compare
            # timestamps/ratios to decide which one to keep
            if (posttestIndex is None) and (eventsBeforeRatio >= eventsAfterRatio) and (0 != eventsBeforeRatio):
                posttestAfterRatio = eventsAfterRatio
                posttestIndex = _index
                _gfDF.loc[_index,QTemporality] = answerTemporalities[1]
            # update pretest if there are more events before?
            # keep the oldest anyways?
            elif (pretestIndex is None) and (eventsBeforeRatio <= eventsAfterRatio) and (0 != eventsAfterRatio):
                pretestBeforeRatio = eventsBeforeRatio
                pretestIndex = _index
                _gfDF.loc[_index,QTemporality] = answerTemporalities[0]

            temporalities[_userId] = [pretestBeforeRatio, posttestAfterRatio, pretestIndex, posttestIndex]
        print("temporalities set (ratio method)")



In [ ]:

    
def getEventCountRatios(answerDate, userId, _rmDF, _gfDF):
    """Return [share of events before answerDate, share of events after it].

    Only the rows of _rmDF whose 'userId' equals userId are considered, and
    their 'userTime' is compared strictly to answerDate. [0,0] is returned
    when the user has no event. _gfDF is not used.
    """
    userTimes = _rmDF[_rmDF['userId']==userId]['userTime']
    totalCount = len(userTimes)
    if totalCount == 0:
        return [0,0]

    beforeCount = int((userTimes < answerDate).sum())
    afterCount = int((userTimes > answerDate).sum())
    return [beforeCount/totalCount, afterCount/totalCount]

score



In [ ]:

    
def getCorrections( _userId, _gfDF, _source = correctAnswers, _columnAnswers = [] ):
    """Return the answers of a user together with their corrections.

    _columnAnswers: answers as produced by getAnswers; fetched for _userId
                    when empty. The frame is completed in place.
    _source: indexed by question; each entry lists the accepted answers.

    For every "answers<N>" column a "corrections<N>" column is added: NaN
    when the question has no accepted answer, True when the given answer
    starts with one of the accepted answers, False otherwise.
    """
    if len(_columnAnswers) == 0:
        _columnAnswers = getAnswers( _userId, _gfDF = _gfDF )

    if len(_columnAnswers.columns) == 0:
        # user has never answered
        print("can't give correct answers")
        return _columnAnswers

    # snapshot of the answer columns: correction columns are added while looping
    answerColumns = [name for name in _columnAnswers.columns if answersColumnNameStem in name]

    for answerColumn in answerColumns:
        correctionColumn = correctionsColumnNameStem + answerColumn.replace(answersColumnNameStem,"")

        # questions without any accepted answer keep this NaN
        _columnAnswers[correctionColumn] = np.nan

        for question in _columnAnswers[answerColumn].index:
            acceptedAnswers = _source.loc[question]
            if len(acceptedAnswers) == 0:
                continue

            givenAnswer = str(_columnAnswers.loc[question,answerColumn])
            _columnAnswers.loc[question,correctionColumn] = any(
                givenAnswer.startswith(str(accepted)) for accepted in acceptedAnswers
            )

    return _columnAnswers
    

# Works in place and returns the very same dataframe.
# _corrections must be a dataframe full of corrections as produced by getCorrections
def getBinarizedCorrections( _corrections ):
    """Replace every cell equal to True by 1.0 and every cell equal to False
    by 0.0; all other cells are left as they are."""
    for column in _corrections.columns:
        for row in _corrections[column].index:
            if True == _corrections.loc[row,column]:
                replacement = 1.0
            elif False == _corrections.loc[row,column]:
                replacement = 0.0
            else:
                continue
            _corrections.loc[row,column] = replacement
    return _corrections

# processes a single gform row
def getBinarized(_gfDFRow, _source = correctAnswers):
    """Return the 0/1 correction of one gform row.

    _source is indexed by question and lists the accepted answers. Only the
    questions having at least one accepted answer are returned: 1 when the
    given answer starts with an accepted answer, 0 otherwise.
    """
    questionsWithAnswers = [q for q in _source.index if len(_source.loc[q]) > 0]

    _binarized = pd.Series(np.full(len(_gfDFRow.index), np.nan), index = _gfDFRow.index)

    for question in _gfDFRow.index:
        acceptedAnswers = _source.loc[question]
        if len(acceptedAnswers) == 0:
            continue

        givenAnswer = str(_gfDFRow.loc[question])
        isCorrect = any(givenAnswer.startswith(str(accepted)) for accepted in acceptedAnswers)
        _binarized.loc[question] = 1 if isCorrect else 0

    return _binarized.loc[questionsWithAnswers]

def getAllBinarized(_gfDF, _source = correctAnswers):
    """Return the binarized corrections of every form, one row per form.

    For each responder of _gfDF, the corrections (getCorrections) are
    binarized (getBinarizedCorrections) and restricted to the questions of
    _source that have at least one accepted answer. Rows are labelled
    "corrections<N>", columns are those questions.
    """
    _notEmptyIndexes = [_index for _index in _source.index if len(_source.loc[_index]) > 0]

    # the leading empty frame fixes the row labels even without any responder
    _frames = [pd.DataFrame(index = _notEmptyIndexes)]
    for _userId in getAllResponders(_gfDF = _gfDF):
        _corrections = getCorrections(_userId, _source=_source, _gfDF = _gfDF)
        _binarized = getBinarizedCorrections(_corrections)
        _correctionColumns = [
            _column for _column in _binarized.columns
            if correctionsColumnNameStem in str(_column)
        ]
        _frames.append(_binarized.loc[_notEmptyIndexes, _correctionColumns])

    # single concatenation: growing the frame inside the loop copied all the
    # accumulated data at every iteration
    return pd.concat(_frames, axis=1).T
    

# Cross-correct-answers matrix:
# CCA.iloc[i,j] counts the users who answered correctly both question i and question j
# CCA[i,j] = Sum(A[u,i] * A[u,j], u in users) = Sum(tA[i,u] * A[u,j], u in users) = tA.A[i,j]
# CCA[i,j] is an int
def getCrossCorrectAnswers( _binarizedAnswers ):
    """Return the matrix product of the transposed _binarizedAnswers with
    _binarizedAnswers (rows: users, columns: questions)."""
    questionsByUsers = _binarizedAnswers.T
    return questionsByUsers.dot(_binarizedAnswers)

#function that returns the score from user id
scoreLabel = 'score'
def getScore( _userId, _gfDF, _source = correctAnswers ):
    """Return the scores of a user, grouped by temporality.

    The result is a one-row dataframe (row label scoreLabel) whose columns
    are answerTemporalities; each cell is the list of scores (number of True
    corrections) of the forms the user submitted with that temporality.
    All lists are empty, and a message is printed, when the user never
    answered.
    """
    # Scores are accumulated in plain lists and the frame is built once at
    # the end: writing a list into a cell through .loc is fragile, pandas
    # handling list-likes as values to broadcast.
    _scoresPerTemporality = {_temporality: [] for _temporality in answerTemporalities}

    if hasAnswered(_userId, _gfDF):
        _columnAnswers = getCorrections(_userId, _gfDF = _gfDF, _source = _source)
        for _columnName in _columnAnswers.columns:
            # only work on corrected columns
            if correctionsColumnNameStem in _columnName:
                _answerColumnName = _columnName.replace(correctionsColumnNameStem,\
                                                      answersColumnNameStem)
                # the temporality is read from the matching answers column
                _temporality = _columnAnswers.loc[QTemporality,_answerColumnName]

                _counts = (_columnAnswers[_columnName]).value_counts()
                _thisScore = 0
                if(True in _counts):
                    _thisScore = _counts[True]
                _scoresPerTemporality[_temporality].append(_thisScore)
    else:
        print("user " + str(_userId) + " has never answered")

    # each column gets a single cell holding its list of scores
    _score = pd.DataFrame(
        {_temporality: [_scores] for _temporality, _scores in _scoresPerTemporality.items()},
        index = [scoreLabel],
        columns = answerTemporalities,
    )
    return _score


def getGFormRowCorrection(_gfDFRow, _source = correctAnswers):
    """Return the correction of one gform row, indexed like the row.

    Each question gets NaN when _source lists no accepted answer for it,
    True when the given answer starts with an accepted answer, and False
    otherwise. An empty row is reported and returned as a copy.
    """
    if len(_gfDFRow) == 0:
        print("this gform row is empty")
        return _gfDFRow.copy()

    result = pd.Series(index = _gfDFRow.index, data = np.full(len(_gfDFRow), np.nan))

    for question in result.index:
        acceptedAnswers = _source.loc[question]
        if len(acceptedAnswers) == 0:
            continue

        givenAnswer = str(_gfDFRow.loc[question])
        result.loc[question] = any(
            givenAnswer.startswith(str(accepted)) for accepted in acceptedAnswers
        )
    return result

def getGFormRowScore( _gfDFRow, _source = correctAnswers):
    """Return the number of True values in the correction of one gform row
    (0 when there is none)."""
    valueCounts = getGFormRowCorrection( _gfDFRow, _source = _source).value_counts()
    return valueCounts[True] if (True in valueCounts) else 0



In [ ]:

    
# Numeric codings of the non-scientific questions: each dictionary maps an
# answer text (English and/or French wording) to a number.
# The keys are compared to the raw form answers: do not edit their spelling.

# shared 1-5 scale; "I don't know" / "Je ne sais pas" are coded as the middle value 3
QCuriosityCoding = {"A lot": 4, "Beaucoup": 4, "Enormément": 5, "Énormément": 5, "Extremely": 5, "Moderately": 3, "Moyennement": 3, "Slightly": 2, "Un peu": 2, "I don't know": 3, "Je ne sais pas": 3, "Not at all": 1, "Pas du tout": 1}

# aliases: these names are bound to the very same dictionary object
QCuriosityBiologyCoding = QCuriosityCoding
QCuriositySyntheticBiologyCoding = QCuriosityCoding
QCuriosityVideoGamesCoding = QCuriosityCoding
QCuriosityEngineeringCoding = QCuriosityCoding

QPlayedCoding = {"I played it multiple times recently": 3, "I played it multiple times recently on this computer": 3, "I played recently on an other computer": 2, "I played it some time ago": 1, "I just played for the first time": 1, "No / not yet": 0, "I don't know": 0}
# the age has no coding: getNumeric converts it with float()
#QAgeCoding
QGenderCoding = {"Female": 1, "Other": 0, "Prefer not to say": 0, "Male": -1}
QInterestVideoGamesCoding = QCuriosityCoding
QInterestBiologyCoding = QCuriosityCoding
QStudiedBiologyCoding = {"Not even in middle school": 0, "Jamais": 0, "Jamais, pas même au collège": 0, "Until the end of middle school": 1, "Jusqu'au brevet": 1, "Until the end of high school": 2, "Jusqu'au bac": 2, "Until bachelor's degree": 3, "Jusqu'à la license": 3, "At least until master's degree": 4, "Au moins jusqu'au master": 4, "I don't know": 0, "Je ne sais pas": 0}
# same values as QCuriosityCoding, with "Rarely" instead of "Slightly"
QPlayVideoGamesCoding = {"A lot": 4, "Beaucoup": 4, "Enormément": 5, "Énormément": 5, "Extremely": 5, "Moderately": 3, "Moyennement": 3, "Rarely": 2, "Un peu": 2, "I don't know": 3, "Je ne sais pas": 3, "Not at all": 1, "Pas du tout": 1}

QHeardSynBioOrBioBricksCoding = {"Yes, and I know what it means" : 2, "Yes, but I don't exactly know what it means": 1, "No": 0}
QVolunteerCoding = {"Yes": 1, "No": 0}
QEnjoyedCoding = {'Extremely': 4, 'A lot': 3, 'Not at all': 0, 'A bit': 1, 'Moderately': 2, "No": 0, "Not applicable: not played yet": -1}

QLanguageCoding = {"en": 0, "fr": 1}
QTemporalityCoding = {"pretest": 0, "posttest": 1, "undefined": -5}

# The two lists below are paired by position: the i-th coding applies to the
# i-th question. Keep them in the same order.
numericDemographicQuestionsCodings = [
QCuriosityBiologyCoding,
QCuriositySyntheticBiologyCoding,
QCuriosityVideoGamesCoding,
QCuriosityEngineeringCoding,
QPlayedCoding,
QGenderCoding,
QInterestVideoGamesCoding,
QInterestBiologyCoding,
QStudiedBiologyCoding,
QPlayVideoGamesCoding,
QHeardSynBioOrBioBricksCoding,
QVolunteerCoding,
QEnjoyedCoding,
QLanguageCoding,
QTemporalityCoding,
]

numericDemographicQuestions = [
QCuriosityBiology,
QCuriositySyntheticBiology,
QCuriosityVideoGames,
QCuriosityEngineering,
QPlayed,
QGender,
QInterestVideoGames,
QInterestBiology,
QStudiedBiology,
QPlayVideoGames,
QHeardSynBioOrBioBricks,
QVolunteer,
QEnjoyed,
QLanguage,
QTemporality,
]

# question key -> coding dictionary, as looked up by getNumeric
numericDemographicQuestionsCodingsSeries = pd.Series(data = numericDemographicQuestionsCodings, index = numericDemographicQuestions)



In [ ]:

    
# processes a single gform row
def getNumeric(_gfDFRow, _source = correctAnswers):
    """Return a numeric version of one gform row.

    - scientific questions: 1 when the answer starts with an accepted answer
      of _source, 0 otherwise (NaN when _source lists no accepted answer);
    - QAge: the answer converted to float, -1 when missing;
    - other demographic questions: the code found in
      numericDemographicQuestionsCodingsSeries, 0 when missing.
    Only the questions for which _source has a non-empty entry are returned.
    """
    questionsWithAnswers = [q for q in _source.index if len(_source.loc[q]) > 0]

    _numeric = pd.Series(np.full(len(_gfDFRow.index), np.nan), index = _gfDFRow.index)

    for question in _gfDFRow.index:
        givenAnswer = _gfDFRow.loc[question]

        if question in scientificQuestions:
            acceptedAnswers = _source.loc[question]
            if len(acceptedAnswers) > 0:
                isCorrect = any(
                    str(givenAnswer).startswith(str(accepted)) for accepted in acceptedAnswers
                )
                _numeric.loc[question] = 1 if isCorrect else 0
        elif question == QAge:
            _numeric.loc[question] = float(givenAnswer) if pd.notnull(givenAnswer) else -1
        elif question in demographicQuestions:
            if pd.notnull(givenAnswer):
                coding = numericDemographicQuestionsCodingsSeries.loc[question]
                _numeric.loc[question] = coding[givenAnswer]
            else:
                _numeric.loc[question] = 0

    return _numeric.loc[questionsWithAnswers]

visualizations



In [ ]:

    
def createStatSet(series, ids = None):
    """Return basic statistics of a series of scores as a dictionary.

    series: the values summarized by 'median', 'mean' and 'std'.
    ids: identifiers used for 'count' (their number) and 'unique' (number of
         distinct ones); the index of `series` is used when ids is None or
         empty.
    """
    # None replaces the former `pd.Series()` default, which was instantiated
    # once, when the function was defined
    if ids is None or 0 == len(ids):
        ids = series.index
    result = {
        'count' : len(ids),
        'unique' : len(ids.unique()),
        'median' : series.median(),
        'mean' : series.mean(),
        'std' : series.std(),
    }
    return result

# _binarized is expected to be shaped like getAllBinarized's output
def getPercentagePerQuestion(_binarized):
    """Return, for each column (question) of _binarized, its sum over the
    rows expressed as a percentage of the number of rows.

    The result is a one-column dataframe indexed by the columns of _binarized.
    """
    rowCount = _binarized.shape[0]
    # a row vector of ones times the matrix sums each column
    columnTotals = np.dot(np.ones(rowCount), _binarized)
    totalPerQuestionDF = pd.DataFrame(data=columnTotals, index=_binarized.columns)
    return totalPerQuestionDF*100 / rowCount



In [ ]:

    
## gfDF can be: all, those who answered both before and after,
## those who played between date1 and date2, ...
from scipy.stats import ttest_ind
def plotBasicStats(
    gfDF,
    title = np.nan,
    includeAll = False,
    includeBefore = True,
    includeAfter = True,
    includeUndefined = False,
    includeProgress = True,
    includeRelativeProgress = False,
    horizontalPlot = True,
    sortedAlong = '', # in ["pretest", "posttest", "progression"]
    figsize=(20,4),
    annot=True,
    cbar=True,
    annot_kws={"size": 10},
    font_scale=1,
):
    """Print score statistics and plot the percentage of correct answers.

    gfDF: form answers; its rows are split on their temporality.
    title: plot title, also printed; a generated title is used when null.
    includeAll / includeBefore / includeAfter / includeUndefined: which
        subsets of gfDF get a statistics column and a heatmap column.
    includeProgress: adds the difference posttest - pretest.
    includeRelativeProgress: adds that difference divided by the pretest
        percentage (0 where the pretest percentage is 0).
    horizontalPlot: questions along the x axis when True.
    sortedAlong: name of the heatmap column used to sort the questions
        (scientific questions first, then demographic ones); ignored when it
        is not one of the displayed columns.
    figsize, annot, cbar, annot_kws, font_scale: forwarded to
        matplotlib / seaborn.

    Returns the matrix displayed by the heatmap, or an empty dataframe when
    no table could be displayed.
    """
    # The pre-/post-test tables are required by their own column and by both
    # progress columns; without them, relative progress alone used to index
    # empty tables.
    needBefore = includeBefore or includeProgress or includeRelativeProgress
    needAfter = includeAfter or includeProgress or includeRelativeProgress

    stepsPerInclude = 2
    # counts the sections that are actually executed below, so that the
    # final consistency check on the progress bar holds for any combination
    includeCount = np.sum([includeAll, needBefore, needAfter, includeUndefined, includeProgress])
    stepsCount = stepsPerInclude*includeCount + 3

    __progress = FloatProgress(min=0, max=stepsCount)
    display(__progress)

    gfDFPretests = gfDF[gfDF[QTemporality] == answerTemporalities[0]]
    gfDFPosttests = gfDF[gfDF[QTemporality] == answerTemporalities[1]]
    gfDFUndefined = gfDF[gfDF[QTemporality] == answerTemporalities[2]]

    scientificQuestionsSource = correctAnswers.copy()
    allQuestionsSource = correctAnswers + demographicAnswers

    categories = ['all', answerTemporalities[0], answerTemporalities[1], answerTemporalities[2],\
                  'progress', 'rel. progress']
    # heatmap column name of each category; the pretest / posttest / progress
    # names are the ones documented for sortedAlong
    columnLabels = dict(zip(
        categories,
        ['all', 'pretest', 'posttest', 'undefined', 'progression', 'rel. progress'],
    ))
    data = {}

    sciBinarized = pd.DataFrame()
    allBinarized = pd.DataFrame()
    scoresAll = pd.DataFrame()

    sciBinarizedBefore = pd.DataFrame()
    allBinarizedBefore = pd.DataFrame()
    scoresBefore = pd.DataFrame()

    sciBinarizedAfter = pd.DataFrame()
    allBinarizedAfter = pd.DataFrame()
    scoresAfter = pd.DataFrame()

    sciBinarizedUndefined = pd.DataFrame()
    allBinarizedUndefined = pd.DataFrame()
    scoresUndefined = pd.DataFrame()

    scoresProgress = pd.DataFrame()

    ## basic stats:
    ### mean score
    ### median score
    ### std
    # each score is the row sum of the binarized scientific answers
    if includeAll:
        sciBinarized = getAllBinarized(gfDF, _source = scientificQuestionsSource)
        __progress.value += 1
        allBinarized = getAllBinarized(gfDF, _source = allQuestionsSource)
        __progress.value += 1
        scoresAll = pd.Series(np.dot(sciBinarized, np.ones(sciBinarized.shape[1])))

        data[categories[0]] = createStatSet(scoresAll, gfDF[localplayerguidkey])

    if needBefore:
        sciBinarizedBefore = getAllBinarized(gfDFPretests, _source = scientificQuestionsSource)
        __progress.value += 1
        allBinarizedBefore = getAllBinarized(gfDFPretests, _source = allQuestionsSource)
        __progress.value += 1
        scoresBefore = pd.Series(np.dot(sciBinarizedBefore, np.ones(sciBinarizedBefore.shape[1])))
        temporaryStatSetBefore = createStatSet(scoresBefore, gfDFPretests[localplayerguidkey])
    if includeBefore:
        data[categories[1]] = temporaryStatSetBefore

    if needAfter:
        sciBinarizedAfter = getAllBinarized(gfDFPosttests, _source = scientificQuestionsSource)
        __progress.value += 1
        allBinarizedAfter = getAllBinarized(gfDFPosttests, _source = allQuestionsSource)
        __progress.value += 1
        scoresAfter = pd.Series(np.dot(sciBinarizedAfter, np.ones(sciBinarizedAfter.shape[1])))
        temporaryStatSetAfter = createStatSet(scoresAfter, gfDFPosttests[localplayerguidkey])
    if includeAfter:
        data[categories[2]] = temporaryStatSetAfter

    if includeUndefined:
        sciBinarizedUndefined = getAllBinarized(gfDFUndefined, _source = scientificQuestionsSource)
        __progress.value += 1
        allBinarizedUndefined = getAllBinarized(gfDFUndefined, _source = allQuestionsSource)
        __progress.value += 1
        scoresUndefined = pd.Series(np.dot(sciBinarizedUndefined, np.ones(sciBinarizedUndefined.shape[1])))

        data[categories[3]] = createStatSet(scoresUndefined, gfDFUndefined[localplayerguidkey])

    if includeProgress:
        # differences of the aggregated statistics (not per-user differences)
        data[categories[4]] = {
            'count' : min(temporaryStatSetAfter['count'], temporaryStatSetBefore['count']),
            'unique' : min(temporaryStatSetAfter['unique'], temporaryStatSetBefore['unique']),
            'median' : temporaryStatSetAfter['median']-temporaryStatSetBefore['median'],
            'mean' : temporaryStatSetAfter['mean']-temporaryStatSetBefore['mean'],
            'std' : temporaryStatSetAfter['std']-temporaryStatSetBefore['std'],
        }
        __progress.value += 2


    result = pd.DataFrame(data)
    __progress.value += 1

    print(title)
    print(result)
    if (includeBefore and includeAfter) or includeProgress:
        if (len(scoresBefore) > 2 and len(scoresAfter) > 2):
            ttest = ttest_ind(scoresBefore, scoresAfter)
            print("t test: statistic=" + repr(ttest.statistic) + " pvalue=" + repr(ttest.pvalue))
    print()

    ## percentage correct
    ### percentage correct - max 5 columns
    percentagePerQuestionAll = pd.DataFrame()
    percentagePerQuestionBefore = pd.DataFrame()
    percentagePerQuestionAfter = pd.DataFrame()
    percentagePerQuestionUndefined = pd.DataFrame()
    percentagePerQuestionProgress = pd.DataFrame()

    # list of [one-column table, category]
    tables = []

    if includeAll:
        percentagePerQuestionAll = getPercentagePerQuestion(allBinarized)
        tables.append([percentagePerQuestionAll, categories[0]])

    if needBefore:
        percentagePerQuestionBefore = getPercentagePerQuestion(allBinarizedBefore)
    if includeBefore:
        tables.append([percentagePerQuestionBefore, categories[1]])

    if needAfter:
        percentagePerQuestionAfter = getPercentagePerQuestion(allBinarizedAfter)
    if includeAfter:
        tables.append([percentagePerQuestionAfter, categories[2]])

    if includeUndefined:
        percentagePerQuestionUndefined = getPercentagePerQuestion(allBinarizedUndefined)
        tables.append([percentagePerQuestionUndefined, categories[3]])

    if includeProgress or includeRelativeProgress:
        percentagePerQuestionProgress = percentagePerQuestionAfter - percentagePerQuestionBefore

        if includeProgress:
            tables.append([percentagePerQuestionProgress, categories[4]])

        if includeRelativeProgress:
            # use temporaryStatSetAfter['count'], temporaryStatSetBefore['count']?
            percentagePerQuestionProgress2 = percentagePerQuestionProgress.copy()
            for index in range(0,len(percentagePerQuestionProgress.index)):
                if (0 == percentagePerQuestionBefore.iloc[index,0]):
                    # no division by a null pretest percentage
                    percentagePerQuestionProgress2.iloc[index,0] = 0
                else:
                    percentagePerQuestionProgress2.iloc[index,0] = \
                    percentagePerQuestionProgress.iloc[index,0]/percentagePerQuestionBefore.iloc[index,0]
            tables.append([percentagePerQuestionProgress2, categories[5]])

    __progress.value += 1

    graphTitle = '% correct: '
    toConcat = []
    # heatmap column names of the tables kept in toConcat, in the same order
    keptLabels = []

    # tables that are empty or contain a NaN are left out of the plot
    for table,category in tables:
        concat = (len(table.values) > 0)
        for elt in table.iloc[:,0].values:
            if np.isnan(elt):
                concat = False
                break
        if(concat):
            graphTitle = graphTitle + category + ' '
            toConcat.append(table)
            keptLabels.append(columnLabels[category])

    # returned as is when there is nothing to plot (the name used to be
    # unbound in that case)
    matrixToDisplay = pd.DataFrame()

    if (len(toConcat) > 0):
        percentagePerQuestionConcatenated = pd.concat(
            toConcat
            , axis=1)

        if(pd.notnull(title) > 0):
            graphTitle = graphTitle + ' - ' + title

        _fig = plt.figure(figsize=figsize)
        _ax1 = plt.subplot(111)
        if pd.isnull(title):
            _ax1.set_title(graphTitle)
        else:
            _ax1.set_title(title)
        matrixToDisplay = percentagePerQuestionConcatenated.round().astype(int)

        # one name per table actually displayed; the former fixed list of
        # three names failed for any other selection of tables
        matrixToDisplay.columns = keptLabels
        if sortedAlong in matrixToDisplay.columns:
            demographicQuestions = demographicAnswers[demographicAnswers.apply(len) > 0].index
            sciSorted = matrixToDisplay.loc[scientificQuestions, :].sort_values(by = sortedAlong, ascending = True)
            demoSorted = matrixToDisplay.loc[demographicQuestions, :].sort_values(by = sortedAlong, ascending = True)
            matrixToDisplay = pd.concat([sciSorted, demoSorted])

        if horizontalPlot:
            matrixToDisplay = matrixToDisplay.T

        sns.set(font_scale=font_scale)
        sns.heatmap(
            matrixToDisplay,
            ax=_ax1,
            cmap=plt.cm.jet,
            square=True,
            annot=annot,
            fmt='d',
            vmin=0,
            vmax=100,
            cbar=cbar,
            annot_kws=annot_kws,
        )

        #if horizontalPlot:
        # both fail
        #heatmap.set_xticklabels(_ax1.get_xticklabels(),rotation=45)
        #plt.xticks(rotation=45)

    __progress.value += 1

    ### percentage cross correct
    ### percentage cross correct, conditionnally

    # consistency check of the step accounting
    if(__progress.value != stepsCount):
        print("__progress.value=" + str(__progress.value) + " != stepsCount=" + str(stepsCount))

    __progress.close()
    del __progress

#    return sciBinarized, sciBinarizedBefore, sciBinarizedAfter, sciBinarizedUndefined, \
#           allBinarized, allBinarizedBefore, allBinarizedAfter, allBinarizedUndefined
    return matrixToDisplay



In [ ]:

    
def plotCorrelationMatrices(
    allBinarized = [],
    beforeBinarized = [],
    afterBinarized = [],
    undefinedBinarized = [],
    titleAll = 'Correlation of pre- & post-test answers',
    titleBefore = 'Correlation of pre-test answers',
    titleAfter = 'Correlation of post-test answers',
    titleUndefined = 'Correlation of undefined answers',
    titleSuffix = '',
):
    """Plot one correlation matrix per non-empty binarized data set.

    Each data set is passed to plotCorrelationMatrix with its own title
    followed by titleSuffix; empty data sets are skipped.
    """
    matrices = [allBinarized, beforeBinarized, afterBinarized, undefinedBinarized]
    fullTitles = [
        baseTitle + titleSuffix
        for baseTitle in (titleAll, titleBefore, titleAfter, titleUndefined)
    ]

    for matrix, fullTitle in zip(matrices, fullTitles):
        if len(matrix) == 0:
            continue
        plotCorrelationMatrix(
            matrix,
            _abs=True,
            _clustered=False,
            _questionNumbers=True,
            _annot = True,
            _figsize = (20,20),
            _title=fullTitle,
        )
    
##correlation
### simple heatmap
### clustermap
# correlation methods accepted by plotCorrelationMatrix; the first one is the fallback
methods = ['pearson', 'kendall', 'spearman']
def plotCorrelationMatrix( 
    _binarizedMatrix, 
    _method = methods[0], 
    _title='Questions\' Correlations', 
    _abs=False, 
    _clustered=False, 
    _questionNumbers=False,
    _annot = False,
    _figsize = (10,10),
    _metric='euclidean'
):
    """Plot the correlation matrix of the columns of _binarizedMatrix.

    Parameters:
      _binarizedMatrix: DataFrame; it is cast to float before DataFrame.corr.
      _method: one of `methods`; any other value falls back to methods[0].
      _title: title of the plain heatmap (not used by the clustered plot).
      _abs: plot absolute correlations; the color scale then starts at 0
        instead of -1.
      _clustered: draw a seaborn clustermap instead of a heatmap. Questions
        whose correlation row is entirely NaN are dropped first because they
        cannot be clustered.
      _questionNumbers: add '#<position>' to the axis labels.
      _annot: annotate cells with getCrossCorrectAnswers(_binarizedMatrix)
        cast to int (presumably cross-correct counts - see that function).
      _figsize: figure size passed to matplotlib / seaborn.
      _metric: distance metric passed to sns.clustermap.

    Returns:
      (clustergrid, overlay) when _clustered is true, None otherwise.
    """
    _progress = FloatProgress(min=0, max=7)
    display(_progress)

    # try/finally: the clustered branch returns early, and the progress widget
    # must be closed on every exit path (it used to stay open in that case)
    try:
        _overlay = False

        _progress.value += 1

        # computation of correlation matrix
        _m = _method
        if(not (_method in methods)):
            _m = methods[0]
        _correlation = _binarizedMatrix.astype(float).corr(_m)
        _progress.value += 1
        if(_abs):
            _correlation = _correlation.abs()
        _progress.value += 1

        if(_clustered):
            # removing NaNs: can't cluster NaN lines in _correlation
            # keep a question if at least one element of its row is not NaN
            # ('not' rather than '~': '~' is only a logical negation on NumPy booleans)
            _notNaNsIndices = [
                index for index in _correlation.index
                if not pd.isnull(_correlation.loc[index,:]).all()
            ]

            _binarizedMatrix = _binarizedMatrix.loc[:,_notNaNsIndices]
            _correlation = _correlation.loc[_notNaNsIndices,_notNaNsIndices]
        _progress.value += 1

        # optional computation of overlay
        if(_annot):
            _overlay = getCrossCorrectAnswers(_binarizedMatrix).astype(int)
        _progress.value += 1

        # preparation of plot labels
        if(_questionNumbers):
            # the lambdas run before the assignment, so get_loc sees the current labels
            _correlation.columns = pd.Series(_correlation.columns).apply(\
                    lambda x: x + ' #' + str(_correlation.columns.get_loc(x) + 1))
            if(_clustered):
                _correlation.index = pd.Series(_correlation.columns).apply(\
                    lambda x: '#' + str(_correlation.columns.get_loc(x) + 1) + ' ' + x)
            else:
                _correlation.index = _correlation.columns
        _progress.value += 1

        vmin = -1
        if _abs:
            vmin = 0
        vmax = 1

        # plot
        if(_clustered):
            result = sns.clustermap(
                _correlation,
                metric=_metric,
                cmap=plt.cm.jet,
                square=True,
                figsize=_figsize,
                annot=_overlay,
                fmt='d',
                vmin=vmin,
                vmax=vmax,
            )
            # TODO: when annotating, check whether the overlay must be reordered
            # using result.dendrogram_col.reordered_ind to follow the clustering
            return result, _overlay

        plt.figure(figsize=_figsize)
        _ax = plt.subplot(111)
        _ax.set_title(_title)
        sns.heatmap(
            _correlation,
            ax=_ax,
            cmap=plt.cm.jet,
            square=True,
            annot=_overlay,
            fmt='d',
            vmin=vmin,
            vmax=vmax,
        )
    finally:
        _progress.close()
    
#def plotAll():
    # loop on question types
    # loop on temporalities
    # loop on representations
    ## basic stats:
    ### mean score
    ### median score
    ### std
    ## percentage correct
    ### percentage correct - 3 columns
    ### percentage cross correct
    ### percentage cross correct, conditionally
    ##correlation
    ### simple heatmap
#    plotCorrelationMatrix
    ### clustermap
#    plotCorrelationMatrix



In [ ]:

    
def plotSamples(gfDFs):
    """Call plotBasicStats on every sample of gfDFs, showing a progress bar.

    gfDFs: sequence of [DataFrame, title] pairs, e.g. as returned by
      getDemographicSamples or getTemporalitySamples.
    Returns None.
    """
    _progress = FloatProgress(min=0, max=len(gfDFs))
    display(_progress)

    for gfDF, title in gfDFs:
        plotBasicStats(gfDF, title)
        _progress.value += 1

    # sanity check: one step per sample
    if(_progress.value != len(gfDFs)):
        # fixed: used to read the undefined name '__progress' (NameError)
        print("__progress.value=" + str(_progress.value) + " != len(gfDFs)=" + str(len(gfDFs)))
            
    _progress.close()
    del _progress



In [ ]:

    
# for per-gform, manual analysis
def getGFormDataPreview(_GFUserId, gfDF):
    """Summarize every survey that user _GFUserId has in the global gform.

    Returns a dict keyed 'survey0', 'survey1', ... (position among the user's
    surveys). Each value is a dict with the survey's 'date', 'temporality RM',
    'temporality GF', 'score', 'genderAge' and 'demographic matches'.
    'demographic matches' lists [index, user id] for the rows of gfDF returned
    by getMatchingDemographics for that survey.

    Note: the user's surveys are read from the global gform, the matches from gfDF.
    """
    userSurveys = gform[gform[localplayerguidkey] == _GFUserId]
    previews = {}

    for position in range(len(userSurveys)):
        survey = userSurveys.iloc[position]

        preview = {
            'date': survey[QTimestamp],
            'temporality RM': survey[QTemporality],
            'temporality GF': getGFormRowGFormTemporality(survey),
            'score': getGFormRowScore(survey),
            'genderAge': [survey[QGender], survey[QAge]],
        }

        # search for other users with similar demographics
        matches = getMatchingDemographics(gfDF, survey)
        preview['demographic matches'] = [
            [matchIndex, matches.loc[matchIndex, localplayerguidkey]]
            for matchIndex in matches.index
        ]

        previews['survey' + str(position)] = preview

    return previews

sample getters

set operators



In [ ]:

    
# indices do not need to be reset as they all come from gform
def getUnionQuestionnaires(gfDF1, gfDF2):
    """Return the rows of gfDF1 followed by those of gfDF2, without duplicate rows.

    A warning is printed when the two frames do not have the same columns;
    the concatenation is still performed.
    """
    # Index.equals instead of (a == b).all(): the element-wise comparison raises
    # ValueError when the two column indexes have different lengths
    if (not gfDF1.columns.equals(gfDF2.columns)):
        print("warning: parameter columns are not the same")
    return pd.concat([gfDF1, gfDF2]).drop_duplicates()



In [ ]:

    
# indices do not need to be reset as they all come from gform
def getIntersectionQuestionnaires(gfDF1, gfDF2):
    """Return the distinct rows present in both gfDF1 and gfDF2.

    The frames are inner-merged on all the columns they share. A warning is
    printed when their columns differ; the merge is still attempted.
    NOTE(review): pd.merge on columns does not keep the original row labels -
    confirm that callers do not rely on gform's index here.
    """
    # Index.equals instead of (a == b).all(): the element-wise comparison raises
    # ValueError when the two column indexes have different lengths
    if (not gfDF1.columns.equals(gfDF2.columns)):
        print("warning: parameter columns are not the same")
    return pd.merge(gfDF1, gfDF2, how = 'inner').drop_duplicates()



In [ ]:

    
# get gfDF1 and gfDF2 rows where users are common to gfDF1 and gfDF2
def getIntersectionUsersSurveys(gfDF1, gfDF2):
    """Return the surveys, from both frames, of the users found in both frames.

    Users are identified by the localplayerguidkey column. The matching rows
    of gfDF1 and gfDF2 are combined with getUnionQuestionnaires.
    """
    userIds1 = gfDF1[localplayerguidkey]
    userIds2 = gfDF2[localplayerguidkey]
    sharedIn1 = gfDF1[userIds1.isin(userIds2)]
    sharedIn2 = gfDF2[userIds2.isin(userIds1)]
    return getUnionQuestionnaires(sharedIn1, sharedIn2)

Users who answered either before or after



In [ ]:

    
# inspection cell: displays the distinct values found in gform's QPlayed column
gform[QPlayed].unique()



In [ ]:

    
def getRMBefores(gfDF):
    """Return the rows of gfDF whose QTemporality equals answerTemporalities[0]."""
    isBefore = gfDF[QTemporality] == answerTemporalities[0]
    return gfDF[isBefore]



In [ ]:

    
def getRMAfters(gfDF):
    """Return the rows of gfDF whose QTemporality equals answerTemporalities[1]."""
    isAfter = gfDF[QTemporality] == answerTemporalities[1]
    return gfDF[isAfter]



In [ ]:

    
# returns users who declared that they have never played the game, whatever platform
#  everPlayedPositives is defined in "../Functions/0.1 GF English localization.ipynb"
def getGFormBefores(gfDF):
    """Return the rows of gfDF whose QPlayed answer is not in everPlayedPositives.

    These are the surveys of users who did not declare having played the game.
    """
    declaredPlayed = gfDF[QPlayed].isin(everPlayedPositives)
    return gfDF[~declaredPlayed]



In [ ]:

    
def isGFormBefore(surveyAnswerIndex, _gform):
    """Tell whether the survey labelled surveyAnswerIndex in _gform is a 'before'.

    True when getGFormBefores keeps exactly one row of the label slice
    surveyAnswerIndex:surveyAnswerIndex.
    """
    selectedRows = _gform.loc[surveyAnswerIndex:surveyAnswerIndex, :]
    beforesCount = len(getGFormBefores(selectedRows))
    return (beforesCount == 1)



In [ ]:

    
# returns users who declared that they have already played the game, whatever platform
#  everPlayedPositives is defined in "../Functions/0.1 GF English localization.ipynb"
def getGFormAfters(gfDF):
    """Return the rows of gfDF whose QPlayed answer is in everPlayedPositives.

    These are the surveys of users who declared having already played the game.
    """
    declaredPlayed = gfDF[QPlayed].isin(everPlayedPositives)
    return gfDF[declaredPlayed]



In [ ]:

    
def isGFormAfter(surveyAnswerIndex, _gform):
    """Tell whether the survey labelled surveyAnswerIndex in _gform is an 'after'.

    True when getGFormAfters keeps exactly one row of the label slice
    surveyAnswerIndex:surveyAnswerIndex.
    """
    selectedRows = _gform.loc[surveyAnswerIndex:surveyAnswerIndex, :]
    aftersCount = len(getGFormAfters(selectedRows))
    return (aftersCount == 1)



In [ ]:

    
# returns an element of answerTemporalities
#  everPlayedPositives is defined in '../Static data/English localization.ipynb'
def getGFormRowGFormTemporality(_gfDFRow):
    """Return the element of answerTemporalities matching the row's QPlayed answer.

    answerTemporalities[1] when the answer is in everPlayedPositives,
    answerTemporalities[0] otherwise.
    """
    declaredPlayed = _gfDFRow[QPlayed] in everPlayedPositives
    return answerTemporalities[1] if declaredPlayed else answerTemporalities[0]

Users who answered both before and after



In [ ]:

    
def getSurveysOfUsersWhoAnsweredBoth(gfDF, gfMode = True, rmMode = False):
    """Return the surveys of users who have both a 'before' and an 'after' survey.

    gfMode: classify surveys with getGFormBefores / getGFormAfters.
    rmMode: classify surveys with getRMBefores / getRMAfters.
    When both modes are set, a survey must satisfy both classifications.
    When neither is set, befores and afters are both the whole of gfDF.
    """
    befores, afters = gfDF, gfDF

    if gfMode:
        befores, afters = getGFormBefores(befores), getGFormAfters(afters)

    if rmMode:
        befores, afters = getRMBefores(befores), getRMAfters(afters)

    return getIntersectionUsersSurveys(befores, afters)



In [ ]:

    
def getSurveysThatAnswered(gfDF, questionsAndPositiveAnswers, hardPolicy = True):
    """Return the rows of gfDF that gave accepted answers to the listed questions.

    Parameters:
      gfDF: DataFrame of surveys.
      questionsAndPositiveAnswers: iterable of [question column, accepted answers].
      hardPolicy: when true, a row must give an accepted answer to every
        listed question (AND); when false, to at least one of them (OR).

    With an empty question list, the hard policy keeps every row and the soft
    policy keeps none.
    """
    requireAll = bool(hardPolicy)

    # The mask starts as the neutral element of the combination (True for AND,
    # False for OR). It is built on gfDF.index so that it stays aligned with the
    # per-question masks; the soft policy used to build it on range(len(gfDF)),
    # which misaligned on any frame without a default 0..n-1 index.
    filterSeries = pd.Series(requireAll, index=gfDF.index, dtype=bool)

    for question, positiveAnswers in questionsAndPositiveAnswers:
        answeredPositively = gfDF[question].isin(positiveAnswers)
        if requireAll:
            filterSeries = filterSeries & answeredPositively
        else:
            filterSeries = filterSeries | answeredPositively

    return gfDF[filterSeries]



In [ ]:

    
# surveys of people who have studied biology, and/or know about synthetic biology, and/or about BioBricks
def getSurveysOfBiologists(gfDF, hardPolicy = True):
    """Return the surveys of people with a biology background.

    Criteria: QStudiedBiology answered within biologyStudyPositives, and
    QHeardSynBioOrBioBricks answered within heardAboutBioBricksPositives.
    hardPolicy requires both criteria; otherwise one of them is enough.
    QInterestBiology is deliberately left out (noted as irrelevant).
    """
    criteria = [
        [QStudiedBiology, biologyStudyPositives],
        [QHeardSynBioOrBioBricks, heardAboutBioBricksPositives],
    ]
    return getSurveysThatAnswered(gfDF, criteria, hardPolicy)



In [ ]:

    
# surveys of people who play video games and/or are interested in them
def getSurveysOfGamers(gfDF, hardPolicy = True):
    """Return the surveys of people who play or are interested in video games.

    Criteria: QInterestVideoGames answered within interestPositives, and
    QPlayVideoGames answered within frequencyPositives.
    hardPolicy requires both criteria; otherwise one of them is enough.
    """
    criteria = [
        [QInterestVideoGames, interestPositives],
        [QPlayVideoGames, frequencyPositives],
    ]
    return getSurveysThatAnswered(gfDF, criteria, hardPolicy)



In [ ]:

    
def getSurveysWithMatchingAnswers(gfDF, _gfDFRow, strictList, extendedList = [], hardPolicy = False):
    """Return the rows of gfDF that answered like _gfDFRow on the given questions.

    Parameters:
      gfDF: DataFrame of surveys to search.
      _gfDFRow: reference survey row.
      strictList: questions whose answers must always match.
      extendedList: further questions that must match when hardPolicy is true.
      hardPolicy: whether extendedList is taken into account.

    All selected questions must match (getSurveysThatAnswered is called with
    its hard policy).
    """
    # copy: 'questions = strictList' followed by '+=' extended the caller's
    # list in place, so it grew on every hardPolicy call
    questions = list(strictList)

    if (hardPolicy):
        questions += extendedList

    # the only accepted answer to each question is the reference row's own answer
    questionsAndPositiveAnswers = [[q, [_gfDFRow[q]]] for q in questions]

    return getSurveysThatAnswered(gfDF, questionsAndPositiveAnswers, True)



In [ ]:

    
#QAge
#QGender

def getMatchingDemographics(gfDF, _gfDFRow, hardPolicy = False):
    """Return the rows of gfDF whose demographics match those of _gfDFRow.

    Age, gender and biology studies must always match. With hardPolicy, the
    interest, hobby, knowledge and language answers must match as well.
    """
    # age, gender and education should not change between two surveys
    stableQuestions = [QAge, QGender, QStudiedBiology]

    # interests, hobbies and knowledge: evaluation may vary after playing
    # language may vary too: players may have missed the opportunity to set it,
    # or may want to try and change it
    changeableQuestions = [
        QInterestVideoGames,
        QPlayVideoGames,
        QInterestBiology,
        QHeardSynBioOrBioBricks,
        QLanguage,
    ]

    return getSurveysWithMatchingAnswers(
        gfDF,
        _gfDFRow,
        stableQuestions,
        extendedList = changeableQuestions,
        hardPolicy = hardPolicy,
    )

Utility functions to gfDF



In [ ]:

    
def getDemographicSamples(gfDF):
    """Return [sample, title] pairs splitting gfDF along demographic lines.

    Samples: the whole frame, each language, each gender, then biologists and
    gamers under the strict (all criteria) and broad (any criterion) policies.
    """
    languages = gfDF[QLanguage]
    genders = gfDF[QGender]

    return [
        [gfDF, 'root gfDF'],
        [gfDF[languages == enLanguageID], 'English'],
        [gfDF[languages == frLanguageID], 'French'],
        [gfDF[genders == 'Female'], 'female'],
        [gfDF[genders == 'Male'], 'male'],
        [getSurveysOfBiologists(gfDF), 'biologists - strict'],
        [getSurveysOfBiologists(gfDF, False), 'biologists - broad'],
        [getSurveysOfGamers(gfDF), 'gamers - strict'],
        [getSurveysOfGamers(gfDF, False), 'gamers - broad'],
    ]



In [ ]:

    
def getTemporalitySamples(gfDF):
    """Return [sample, title] pairs splitting gfDF by answer temporality.

    Samples: the whole frame; befores, then afters, according to RedMetrics
    (RM), to the Google form (GF) and to both; finally the surveys of users
    who answered both before and after under each of the three classifications.
    """
    gformBefores = getGFormBefores(gfDF)
    gformAfters = getGFormAfters(gfDF)

    samples = [[gfDF, 'root gfDF']]

    samples.append([getRMBefores(gfDF), 'RedMetrics befores'])
    samples.append([gformBefores, 'Google form befores'])
    samples.append([getRMBefores(gformBefores), 'GF & RedMetrics befores'])

    samples.append([getRMAfters(gfDF), 'RedMetrics afters'])
    samples.append([gformAfters, 'Google form afters'])
    samples.append([getRMAfters(gformAfters), 'GF & RedMetrics afters'])

    # (gfMode, rmMode, title) of the "answered both" samples
    bothModes = [
        (True, False, 'GF both before and after'),
        (False, True, 'RM both before and after'),
        (True, True, 'GF & RM both before and after'),
    ]
    for useGF, useRM, title in bothModes:
        samples.append([getSurveysOfUsersWhoAnsweredBoth(gfDF, gfMode = useGF, rmMode = useRM), title])

    return samples

checkpoint validation



In [ ]:

    
#function that returns the list of checkpoints from user id
def getValidatedCheckpoints( userId, _gfDF ):
    """Return the checkpoints validated by userId, one list per corrected column.

    For every column of getCorrections(userId, _gfDF) whose name contains
    correctionsColumnNameStem, the checkpoint of checkpointQuestionMatching is
    collected for each question whose correction is True. The result is a
    pd.Series holding, per such column, a sorted Series of the distinct
    non-empty checkpoints.

    When hasAnswered(userId, _gfDF) is false, a message is printed and the
    result is built from an empty list.
    """
    _validatedCheckpoints = []
    
    if hasAnswered(userId, _gfDF):
        _columnAnswers = getCorrections( userId, _gfDF = _gfDF)
        
        for _columnName in _columnAnswers.columns:
            # only work on corrected columns
            if correctionsColumnNameStem in _columnName:
                # object dtype and '' from the start: filling a float NaN Series
                # with strings relied on an implicit upcast that pandas deprecates
                _questionnaireValidatedCheckpointsPerQuestion = pd.Series(
                    '',
                    index=range(len(checkpointQuestionMatching)),
                    dtype=object,
                )

                # '' stays in place for the questions that were not answered correctly
                for _index in range(0, len(_questionnaireValidatedCheckpointsPerQuestion)):
                    if _columnAnswers[_columnName][_index]==True:
                        _questionnaireValidatedCheckpointsPerQuestion[_index] = checkpointQuestionMatching['checkpoint'][_index]

                # distinct, non-empty checkpoints, sorted and re-indexed from 0
                _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpointsPerQuestion.unique()
                _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpoints[_questionnaireValidatedCheckpoints!='']
                _questionnaireValidatedCheckpoints = pd.Series(_questionnaireValidatedCheckpoints)
                _questionnaireValidatedCheckpoints = _questionnaireValidatedCheckpoints.sort_values()
                _questionnaireValidatedCheckpoints.index = range(0, len(_questionnaireValidatedCheckpoints))
                
                _validatedCheckpoints.append(_questionnaireValidatedCheckpoints) 
    else:
        print("user " + str(userId) + " has never answered")
    return pd.Series(_validatedCheckpoints)

def getValidatedCheckpointsCounts( _userId, _gfDF ):
    """Return the length of each element returned by getValidatedCheckpoints, as a list."""
    return [
        len(checkpointsList)
        for checkpointsList in getValidatedCheckpoints(_userId, _gfDF = _gfDF)
    ]

def getNonValidated( checkpoints ):
    """Return, for each list of validated checkpoints, those still to validate.

    checkpoints: collection of Series of validated checkpoints.
    Returns a pd.Series with one Series per element of checkpoints, holding
    the non-empty values of validableCheckpoints absent from that element.
    When checkpoints is empty, validableCheckpoints itself is returned.
    """
    if len(checkpoints) == 0:
        return validableCheckpoints

    _remainingLists = []
    for _validated in checkpoints:
        _missing = np.setdiff1d(validableCheckpoints.values, _validated.values)
        _remaining = pd.Series(_missing)
        _remaining = _remaining[_remaining != '']
        _remaining.index = range(0, len(_remaining))
        _remainingLists.append(_remaining)
    return pd.Series(_remainingLists)

def getNonValidatedCheckpoints( userId, _gfDF ):
    """Return getNonValidated applied to the checkpoints validated by userId in _gfDF."""
    return getNonValidated(getValidatedCheckpoints( userId, _gfDF = _gfDF ))

def getNonValidatedCheckpointsCounts( userId, _gfDF ):
    """Return the length of each element returned by getNonValidatedCheckpoints, as a list."""
    return [
        len(checkpointsList)
        for checkpointsList in getNonValidatedCheckpoints(userId, _gfDF = _gfDF)
    ]

p(answered question N | answered question P)



In [ ]:

    
# returns all rows of Google form's answers that contain an element 
#   of the array 'choice' for question number 'questionIndex'
def getAllAnswerRows(questionIndex, choice, _gfDF ):
    """Return the rows of _gfDF whose answer to a question is one of `choice`.

    questionIndex: position of the question's column in _gfDF.
    choice: collection of accepted answers.
    """
    answers = _gfDF.iloc[:, questionIndex]
    isChosen = answers.isin(choice)
    return _gfDF[isChosen]

def getPercentCorrectPerColumn(_df):
    """Return, per column of _df, the fraction of rows that answered correctly.

    Only columns positioned from firstEvaluationQuestionIndex up to, but not
    including, the last three are evaluated; the others stay NaN. An answer is
    correct when its string form starts with the string form of
    correctAnswers[<column position>].

    The returned Series is indexed by _df.columns, plus a 'Count' entry
    holding the number of rows.
    """
    rowCount = len(_df)
    columnCount = len(_df.columns)
    ratios = pd.Series(np.full(columnCount, np.nan), index=_df.columns)

    for rowLabel in _df.index:
        for name in _df.columns:
            position = _df.columns.get_loc(name)
            isEvaluated = (position >= firstEvaluationQuestionIndex) and (position < columnCount - 3)
            if not isEvaluated:
                continue

            isCorrect = str(_df[name][rowLabel]).startswith(str(correctAnswers[position]))
            tally = ratios[name]
            if np.isnan(tally):
                # first row seen for this column: start the tally
                ratios[name] = 1 if isCorrect else 0
            elif isCorrect:
                ratios[name] = tally + 1

    ratios = ratios / rowCount
    ratios['Count'] = rowCount
    return ratios

def getPercentCorrectKnowingAnswer(questionIndex, choice, _gfDF):
    """Return getPercentCorrectPerColumn restricted to the rows that answered
    one of `choice` to the question at column position questionIndex."""
    return getPercentCorrectPerColumn(
        getAllAnswerRows(questionIndex, choice, _gfDF = _gfDF)
    )

Filtering users



In [ ]:

    
def getTestAnswers( _gfDF, _rmDF, _rmTestDF = normalizedRMDFTest, includeAndroid = True):
    """Return the rows of _gfDF whose user id is one of the values of testUsers.

    _rmDF, _rmTestDF and includeAndroid are accepted but not used here.
    """
    userIds = _gfDF[localplayerguidkey]
    isTestUser = userIds.isin(testUsers.values.flatten())
    return _gfDF[isTestUser]



In [ ]:

    
# ambiguous answer to QPlayed
AUnclassifiable = 'I played recently on an other computer'

# fill posttests with pretest data
def setPosttestsProfileInfo(_gfDF):
    """Copy profile answers into each user's posttests, modifying _gfDF in place.

    Does nothing, apart from printing "temporalities not set", when
    _gfDF[QTemporality] holds a single distinct value.

    Otherwise every row is visited. A row is a source row when its
    QTemporality is answerTemporalities[0], or when it is
    answerTemporalities[1] and its QPlayed answer is AUnclassifiable.
    For each source row:
      - if none of its profile answers is null, its QAge is cast to int;
        otherwise "nan for index <index>" is printed;
      - in both cases its profile answers are written into every row of the
        same 'userId' whose QTemporality is answerTemporalities[1].

    The profile columns are selected with the global survey1522DF[profileColumn],
    so survey1522DF must be assigned before this function is called.
    Returns None.
    """
    # check whether temporalities have already been set
    if(len(_gfDF[QTemporality].unique()) == 1):
        print("temporalities not set")
    else:

        # one progress step per row of _gfDF
        intProgress = IntProgress(min=0, max=len(_gfDF.index))
        display(intProgress)

        #_gfDF[_gfDF[QTemporality] == answerTemporalities[1]][QAge]

        for _index in _gfDF.index:
            intProgress.value += 1
            # source rows: pretests, or posttests with the ambiguous QPlayed answer
            if ((_gfDF.loc[_index, QTemporality] == answerTemporalities[0])
                    or
                    (_gfDF.loc[_index, QTemporality] == answerTemporalities[1]
                        and
                     _gfDF.loc[_index, QPlayed] == AUnclassifiable
                    )
               ):
                # survey1522DF[profileColumn] is a boolean mask over the columns
                if pd.isnull(_gfDF.loc[_index, survey1522DF[profileColumn]]).any():
                    print("nan for index " + str(_index))
                else:
                    # fix on age loading
                    _gfDF.loc[_index, QAge] = int(_gfDF.loc[_index, QAge])

                # posttests of the same user; only used to check that there are some
                thisUserIdsPostests = _gfDF.loc[
                        (_gfDF['userId'] == _gfDF.loc[_index, 'userId'])
                        &
                        (_gfDF[QTemporality] == answerTemporalities[1])
                ]

                # the copy also happens when the profile contains nulls (warning above)
                if(len(thisUserIdsPostests) > 0):
                    _gfDF.loc[
                        (_gfDF['userId'] == _gfDF.loc[_index, 'userId'])
                        &
                        (_gfDF[QTemporality] == answerTemporalities[1])
                        ,survey1522DF[profileColumn]] = _gfDF.loc[_index, survey1522DF[profileColumn]].values

        intProgress.close()
        del intProgress
                    
        print("profile info set")



In [ ]:

    
# names of the boolean columns of the question-type table built by getQuestionTypes
# (one row per gform column)
lastAddedColumn = 'lastAdded'
profileColumn = 'profile'
commonColumn = 'common'
compulsoryPretestColumn = 'compulsoryPretest'
optionalPretestColumn = 'optionalPretest'
compulsoryPosttestColumn = 'compulsoryPosttest'

#QVolunteer
# question used by getQuestionTypes to tell whether a non-pretest survey has content answers
QContent = QBioBricksDevicesComposition
#QRemarks

def getQuestionTypes():
    """Classify every question (column) of the global gform.

    Returns a DataFrame indexed by gform.columns with boolean columns:
      compulsoryPretest: answered in at least one pretest whose QVolunteer
        answer is not in yesNoPositives.
      optionalPretest: answered in at least one pretest that volunteered, but
        in no pretest that did not.
      compulsoryPosttest: answered in at least one non-pretest survey whose
        QPlayed answer differs from APlayedButProfileAgain.
      common: both compulsoryPretest and compulsoryPosttest.
      lastAdded: left unanswered in at least one survey of the kind that
        otherwise answers it (presumably questions added to the form later -
        TODO confirm).
      profile: compulsoryPretest but not compulsoryPosttest.
    The QRemarks row is forced to False in every column.

    Pretests are the rows whose QTemporality is answerTemporalities[0].
    """

    # two passes over the surveys
    intProgress = IntProgress(min=0, max=2*len(gform.index))
    display(intProgress)
    
    survey1522DF = pd.DataFrame(index = gform.columns, data = False,
                            columns = [lastAddedColumn, commonColumn, compulsoryPretestColumn,compulsoryPosttestColumn])
    
    pretestQuestions = pd.Index([])
    pretestNotVolunteeredQuestions = pd.Index([])
    posttestQuestions = pd.Index([])
    lastAddedQuestions = pd.Index([])

    # first pass: which questions are answered in each kind of survey
    for answerIndex in gform.index:
        intProgress.value += 1
        # label-based row access, consistent with the gform.loc lookups below
        # (iloc with a label is only right when the index is 0..n-1)
        answer = gform.loc[answerIndex,:]
        
        if gform.loc[answerIndex, QTemporality] == answerTemporalities[0]:
            # has volunteered?
            if gform.loc[answerIndex, QVolunteer] in yesNoPositives:
                pretestQuestions = pretestQuestions.union(answer[pd.notnull(answer[:])].index)
            else:
                pretestNotVolunteeredQuestions = pretestNotVolunteeredQuestions.union(answer[pd.notnull(answer[:])].index)

        elif gform.loc[answerIndex, QPlayed] != APlayedButProfileAgain:
            posttestQuestions = posttestQuestions.union(answer[pd.notnull(answer[:])].index)

    
    survey1522DF[compulsoryPretestColumn] = survey1522DF.index.isin(pretestNotVolunteeredQuestions)
    survey1522DF[optionalPretestColumn] = survey1522DF.index.isin(pretestQuestions.difference(pretestNotVolunteeredQuestions))
    survey1522DF[compulsoryPosttestColumn] = survey1522DF.index.isin(posttestQuestions)
    survey1522DF[commonColumn] = (survey1522DF[compulsoryPretestColumn] & survey1522DF[compulsoryPosttestColumn])
    
    # second pass: questions of a survey's kind that the survey left unanswered
    for answerIndex in gform.index:
        intProgress.value += 1
        answer = gform.loc[answerIndex,:]
        
        if gform.loc[answerIndex, QTemporality] == answerTemporalities[0]:
            # has volunteered?
            if gform.loc[answerIndex, QVolunteer] in yesNoPositives:
                lastAddedQuestions = lastAddedQuestions.union(answer[pretestQuestions][pd.isnull(answer[pretestQuestions])].index)
            else:
                lastAddedQuestions = lastAddedQuestions.union(answer[pretestNotVolunteeredQuestions][pd.isnull(answer[pretestNotVolunteeredQuestions])].index)
        # non-pretest surveys only count when they answered the content question
        elif not pd.isnull(gform.loc[answerIndex, QContent]):
            lastAddedQuestions = lastAddedQuestions.union(answer[posttestQuestions][pd.isnull(answer[posttestQuestions])].index)

    survey1522DF[lastAddedColumn] = survey1522DF.index.isin(lastAddedQuestions)
    # manual override
    survey1522DF.loc[QRemarks] = False
            
    survey1522DF[profileColumn] = survey1522DF[compulsoryPretestColumn] & (~survey1522DF[compulsoryPosttestColumn])
       
    intProgress.close()
    del intProgress
        
    return survey1522DF

remove answers that are incomplete

e.g. posttests with no content questions or pretests with no profile info



In [ ]:

    
def getPosttestsWithoutPretests(_gfDF):
    """Return the index labels of the posttests whose 'userId' has no pretest in _gfDF.

    Pretests / posttests are the rows whose QTemporality equals
    answerTemporalities[0] / answerTemporalities[1].
    """
    temporality = _gfDF[QTemporality]
    pretestIds = _gfDF.loc[temporality == answerTemporalities[0], 'userId']
    posttestIds = _gfDF.loc[temporality == answerTemporalities[1], 'userId']
    hasPretest = posttestIds.isin(pretestIds)
    return posttestIds[~hasPretest].index

def getPretestsWithoutPosttests(_gfDF):
    """Return the index labels of the pretests whose 'userId' has no posttest in _gfDF.

    Pretests / posttests are the rows whose QTemporality equals
    answerTemporalities[0] / answerTemporalities[1].
    """
    temporality = _gfDF[QTemporality]
    pretestIds = _gfDF.loc[temporality == answerTemporalities[0], 'userId']
    posttestIds = _gfDF.loc[temporality == answerTemporalities[1], 'userId']
    hasPosttest = pretestIds.isin(posttestIds)
    return pretestIds[~hasPosttest].index



In [ ]:

    
def getWithoutIncompleteAnswers(_gfDF):
    """Return a copy of _gfDF without the surveys that lack profile answers.

    A survey is dropped when any of its profile columns (those flagged in the
    global survey1522DF[profileColumn]) is null. Posttests left without a
    matching pretest are then dropped as well.
    """
    # remove incomplete profiles
    #  coincidentally removes posttests that don't have matching pretests
    profileQuestions = _gfDF.columns[survey1522DF[profileColumn]]
    hasIncompleteProfile = _gfDF[profileQuestions].isnull().any(axis=1)
    withCompleteProfiles = _gfDF.drop(_gfDF.index[hasIncompleteProfile])

    # defensive check
    orphanPosttests = getPosttestsWithoutPretests(withCompleteProfiles)
    return withCompleteProfiles.drop(orphanPosttests)



In [ ]:

    
def getPerfectPretestPostestPairsCount(_gfDF):
    """Return the number of distinct users in getPerfectPretestPostestPairs(_gfDF).

    A warning is printed when that number differs from half the number of
    rows (integer division).
    """
    pairs = getPerfectPretestPostestPairs(_gfDF)
    expectedCount = len(pairs)//2
    userCount = len(pairs['userId'].unique())
    if (expectedCount != userCount):
        print('warning: halfPairsCount ('+str(expectedCount)+') != uniqueUserIdsCount ('+str(userCount)+')')
    return userCount

Initialization of gform



In [ ]:

    
# Initialization of the global gform; the order of these statements matters.
# resetTemporalities and setAnswerTemporalitiesSimple are defined outside this section.
resetTemporalities(gform)
#setAnswerTemporalities()
#setAnswerTemporalities2()
setAnswerTemporalitiesSimple(gform)
# getQuestionTypes reads gform[QTemporality], so temporalities are set first
survey1522DF = getQuestionTypes()
# setPosttestsProfileInfo reads the global survey1522DF assigned just above
setPosttestsProfileInfo(gform)

2. Google form analysis

Table of Contents

Preparation

Constants

Functions

general purpose

sessions and temporalities

score

visualizations

sample getters

set operators

Users who answered either before or after

Users who answered both before and after

Utility functions to gfDF

checkpoint validation

p(answered question N | answered question P)

Filtering users

remove answers that are incomplete

Initialization of gform