Hero.Coli Data Analysis Summary

Interactive list of readworthy results from Hero.Coli data analysis.

Preparation


In [ ]:
# Load analysis helpers: RM-GF correlation functions and plotting utilities.
# These notebooks define gform, gfdf/rmdf samples, and all helper functions
# used below (getUniqueUserCount, plotBasicStats, ...).
%run "../Functions/8. RM-GF correlations.ipynb"
%run "../Functions/Plot.ipynb"

Sample selection


In [ ]:
### Online 1.52.2

#gfdf = gfdfWebgl1522PretestPosttestUniqueProfilesVolunteers.copy()
#rmdf = rmdfWebgl1522PretestPosttestUniqueProfilesVolunteers.copy()

### Playtest

#gfdf = gfdfPlaytestTotalPretestPosttestUniqueProfilesVolunteers.copy()
#gfdf = gfdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.copy()
#gfdf = gfdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers.copy()

#rmdf = rmdfPlaytestTotalPretestPosttestUniqueProfilesVolunteers.copy()
#rmdf = rmdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.copy()
#rmdf = rmdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers.copy()

### Online 1.60

#gfdf = gfdfWebgl160PretestPosttestUniqueProfilesVolunteers.copy()
#rmdf = rmdfWebgl160PretestPosttestUniqueProfilesVolunteers.copy()

In [ ]:
# For quicker allData switching.
# Active sample: Playtest phase 1, volunteers with matched pretest+posttest
# and unique profiles. Alternative samples are listed (commented out) in the
# cell above; switch by editing these three assignments together.
gfdf    =          gfdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.copy()
rmdf    =          rmdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.copy()
allData = allBinaryDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.copy()

In [ ]:
# For quicker allData switching.
#gfdf    =          gfdfWebgl1522Timed.copy()
#rmdf    =          rmdfWebgl1522Timed.copy()

1. Google form analysis

Survey counts


In [ ]:
# Survey counts for the full, unfiltered survey dataframe (gform).
print("sample:               gform")
print("surveys:              %s" % len(gform))
print("unique users:         %s" % getUniqueUserCount(gform))
print("RM before:            %s" % len(gform[gform[QTemporality] == answerTemporalities[0]]))
print("GF before:            %s" % len(getGFormBefores(gform)))
print("RM after:             %s" % len(gform[gform[QTemporality] == answerTemporalities[1]]))
print("GF after:             %s" % len(getGFormAfters(gform)))
print("unique biologists:    %s" % getUniqueUserCount(getSurveysOfBiologists(gform)))
print("unique gamers:        %s" % getUniqueUserCount(getSurveysOfGamers(gform)))
print("unique perfect users: %s" % getUniqueUserCount(getSurveysOfUsersWhoAnsweredBoth(gform)))
# Fix: this line previously reused the "unique perfect users" label although it
# reports matched pretest/posttest pairs — a different metric than the line above.
print("perfect pairs:        %s" % getPerfectPretestPostestPairsCount(gform))

In [ ]:
# Survey counts for the currently selected sample (gfdf).
print("sample:               gfdf")
print("surveys:              %s" % len(gfdf))
print("unique users:         %s" % getUniqueUserCount(gfdf))
print("RM before:            %s" % len(gfdf[gfdf[QTemporality] == answerTemporalities[0]]))
print("GF before:            %s" % len(getGFormBefores(gfdf)))
print("RM after:             %s" % len(gfdf[gfdf[QTemporality] == answerTemporalities[1]]))
print("GF after:             %s" % len(getGFormAfters(gfdf)))
print("unique biologists:    %s" % getUniqueUserCount(getSurveysOfBiologists(gfdf)))
print("unique gamers:        %s" % getUniqueUserCount(getSurveysOfGamers(gfdf)))
print("unique perfect users: %s" % getUniqueUserCount(getSurveysOfUsersWhoAnsweredBoth(gfdf)))
# Fix: relabeled — this is the matched pretest/posttest pair count,
# not a duplicate of the unique-perfect-users line above.
print("perfect pairs:        %s" % getPerfectPretestPostestPairsCount(gfdf))

formatted version for nice display


In [ ]:
# Markdown-table version of the gform counts above, for pasting into reports.
print("category | count")
print("--- | ---")
print("sample | gform")
print("surveys | %s" % len(gform))
print("unique users | %s" % getUniqueUserCount(gform))
print("RM before | %s" % len(gform[gform[QTemporality] == answerTemporalities[0]]))
print("GF before | %s" % len(getGFormBefores(gform)))
print("RM after | %s" % len(gform[gform[QTemporality] == answerTemporalities[1]]))
print("GF after | %s" % len(getGFormAfters(gform)))
print("unique biologists | %s" % getUniqueUserCount(getSurveysOfBiologists(gform)))
print("unique gamers | %s" % getUniqueUserCount(getSurveysOfGamers(gform)))
print("unique perfect users | %s" % getUniqueUserCount(getSurveysOfUsersWhoAnsweredBoth(gform)))
# Fix: relabeled — matched pretest/posttest pair count, distinct from the
# unique-perfect-users row above (previously both rows had the same label).
print("perfect pairs | %s" % getPerfectPretestPostestPairsCount(gform))
print()
#print("(" + str(pd.to_datetime('today').date()) + ")")
print("("+dataFilesNamesStem+")")

In [ ]:
# Markdown-table version of the gfdf counts above, for pasting into reports.
print("category | count")
print("--- | ---")
print("sample | gfdf")
print("surveys | %s" % len(gfdf))
print("unique users | %s" % getUniqueUserCount(gfdf))
print("RM before | %s" % len(gfdf[gfdf[QTemporality] == answerTemporalities[0]]))
print("GF before | %s" % len(getGFormBefores(gfdf)))
print("RM after | %s" % len(gfdf[gfdf[QTemporality] == answerTemporalities[1]]))
print("GF after | %s" % len(getGFormAfters(gfdf)))
print("unique biologists | %s" % getUniqueUserCount(getSurveysOfBiologists(gfdf)))
print("unique gamers | %s" % getUniqueUserCount(getSurveysOfGamers(gfdf)))
print("unique perfect users | %s" % getUniqueUserCount(getSurveysOfUsersWhoAnsweredBoth(gfdf)))
# Fix: relabeled — matched pretest/posttest pair count, distinct from the
# unique-perfect-users row above (previously both rows had the same label).
print("perfect pairs | %s" % getPerfectPretestPostestPairsCount(gfdf))
print()
#print("(" + str(pd.to_datetime('today').date()) + ")")
print("("+dataFilesNamesStem+")")

1.1 complete sample


In [ ]:
#plotSamples(getDemographicSamples(gfdf))

In [ ]:
#plotSamples(getTemporalitySamples(gfdf))

1.2 Per temporality

1.2.1 answered only before


In [ ]:
gf_befores = getGFormBefores(gfdf)
rm_befores = getRMBefores(gfdf)
gfrm_befores = getRMBefores(getGFormBefores(gfdf))

In [ ]:
(gf_befores[QUserId] == rm_befores[QUserId]).all()

In [ ]:
#plotSamples(getDemographicSamples(gf_befores))

1.2.2 answered only after


In [ ]:
# Posttest ("after") subsets, mirroring the befores cell above.
gf_afters = getGFormAfters(gfdf)
rm_afters = getRMAfters(gfdf)
# Bug fix: this previously computed getRMAfters(getGFormBefores(gfdf)),
# mixing the 'before' subset into the afters pipeline. The befores cell
# composes getRMBefores(getGFormBefores(...)); the parallel here is
# getRMAfters(getGFormAfters(...)).
gfrm_afters = getRMAfters(getGFormAfters(gfdf))

In [ ]:
(gf_afters[QUserId] == rm_afters[QUserId]).all()

In [ ]:
#plotSamples(getDemographicSamples(gf_afters))

1.2.3 answered both before and after


In [ ]:
# Surveys of users who answered both pretest and posttest.
# NOTE(review): gfMode/rmMode presumably select which source's temporality
# definition (Google-form vs RedMetrics) is used to decide "answered both" —
# confirm against the definition in "8. RM-GF correlations.ipynb".
gf_both = getSurveysOfUsersWhoAnsweredBoth(gfdf, gfMode = True, rmMode = False)
rm_both = getSurveysOfUsersWhoAnsweredBoth(gfdf, gfMode = False, rmMode = True)
gfrm_both = getSurveysOfUsersWhoAnsweredBoth(gfdf, gfMode = True, rmMode = True)

In [ ]:
#plotSamples(getDemographicSamples(gf_both))

In [ ]:
#plotSamples(getDemographicSamples(rm_both))

In [ ]:
#plotSamples(getDemographicSamples(gfrm_both))

1.2.4 pretest vs posttest

1.2.4.1 phase1

In [ ]:
# Heatmap of per-question correct-answer percentages (pretest / posttest /
# progression), questions kept in their original order (sortedAlong="").
# The returned matrix is reused by the bar-chart cells below.
matrixToDisplay = plotBasicStats(
    gfdf,
    horizontalPlot=False,
    sortedAlong="",
    figsize=(12,20),
    title = 'percentages of correct answers',
    annot=True,
    annot_kws={"size": 13},
    font_scale=1.3,
);

In [ ]:
matrixToDisplay = plotBasicStats(
    gfdf,
    title = 'percentages of correct answers (sorted)',
    sortedAlong="progression",
    horizontalPlot=False,
    figsize=(12,20),
    annot=True,
    annot_kws={"size": 13},
    font_scale=1.3,
);

In [ ]:
if False:
    #barIndices = matrixToDisplay.index
    barIndices = scientificQuestions
    matrixToDisplaySubset = matrixToDisplay.loc[scientificQuestions,:]
    pretestBars = matrixToDisplaySubset.loc[barIndices, 'pretest']
    posttestBars = matrixToDisplaySubset.loc[barIndices, 'posttest']
    plt.bar(np.arange(len(barIndices)), pretestBars)

In [ ]:
if False:
    # data to plot
    #barIndices = matrixToDisplay.index
    barIndices = scientificQuestions
    matrixToDisplaySubset = matrixToDisplay.loc[scientificQuestions,:]
    pretestBars = matrixToDisplaySubset.loc[barIndices, 'pretest']
    posttestBars = matrixToDisplaySubset.loc[barIndices, 'posttest']
    n_groups = len(barIndices)

    # create plot
    fig, ax = plt.subplots(figsize=(15,7))
    index = np.arange(n_groups)
    bar_width = 0.35
    opacity = 0.8

    rects1 = plt.bar(index, pretestBars, bar_width,
                     alpha=opacity,
                     color='b',
                     label='pretest')

    rects2 = plt.bar(index + bar_width, posttestBars, bar_width,
                     alpha=opacity,
                     color='g',
                     label='posttest')

    plt.xlabel('questions')
    plt.ylabel('percentage correct')
    #plt.title('Percentage correct - pretest and posttest')
    #plt.xticks(index + bar_width, barIndices, rotation='vertical')
    plt.xticks(index + bar_width, questionCategories, rotation='vertical')
    plt.legend()

    plt.tight_layout()
    plt.show()

In [ ]:
barIndicesHardcoded = [
    
       'Device: PCONS:GFP:RBS:TER XXX', 'Device: PBAD:GFP:RBS:TER XXX',
       'Device: AMPR:RBS:PCONS:TER XXX', 'Device: GFP:RBS:PCONS:TER XXX',
       'Device: RBS:PCONS:AMPR:TER XXX', 'Device: RBS:PCONS:FLHDC:TER XXX',
    
    'Function - game: CDS', 'Function: PR', 'Function - biology: CDS',
       'Example: CDS', 'Function: Plasmid', 'Function: TER', 'Function: RBS',
    
    'Name: Operator XXX', 'Name: RBS', 'Name: CDS', 'Name: PR',
       'Name: Plasmid', 'Name: TER',
    
    'Device: PCONS:RBS:FLHDC:TER', 'Device: PBAD:RBS:ARA:TER', 'Device: PBAD:RBS:GFP:TER',       
       
       'Unequip the movement device: effect',
       'BioBricks and devices composition', 'Green fluorescence',
       'Ampicillin antibiotic', 'Genotype and phenotype',]

In [ ]:
# Pretest vs posttest correct-answer percentages as a grouped bar chart,
# bars ordered by the hardcoded question list defined in the cell above.
if True:
    # data to plot
    #barIndices = matrixToDisplay.index
    barIndices = scientificQuestions
    matrixToDisplaySubset = matrixToDisplay.loc[scientificQuestions,:]
    matrixToDisplaySubset['qCategory'] = questionCategories
    matrixToDisplaySubset = matrixToDisplaySubset.sort_values(by=['qCategory', 'posttest'])
    barIndices = matrixToDisplaySubset.index

    # NOTE(review): the hardcoded ordering below overrides the computed sort
    # order above — the sort_values call only affects row content, not display.
    barIndices = barIndicesHardcoded

    matrixToDisplaySubset = matrixToDisplaySubset.loc[barIndices,:]
    barIndices = matrixToDisplaySubset.index
    pretestBars = matrixToDisplaySubset.loc[barIndices, 'pretest']
    posttestBars = matrixToDisplaySubset.loc[barIndices, 'posttest']
    n_groups = len(barIndices)

    # create plot
    fig, ax = plt.subplots(figsize=(15,7))
    index = np.arange(n_groups)
    bar_width = 0.35
    opacity = 0.8

    rects1 = plt.bar(index, pretestBars, bar_width,
                     alpha=opacity,
                     color='b',
                     label='pretest')

    rects2 = plt.bar(index + bar_width, posttestBars, bar_width,
                     alpha=opacity,
                     color='g',
                     label='posttest')

    #plt.xlabel('questions')
    plt.ylabel('Correct answers')
    #plt.title('Percentage correct - pretest and posttest')
    #plt.xticks(index + bar_width, barIndices.map(questionCategoriesDictionary), rotation='vertical')
    plt.xticks(index + bar_width, barIndices, rotation='vertical')
    #plt.xticks(index + bar_width, questionCategories, rotation='vertical')
    plt.legend()

    plt.tight_layout()
    plt.show()

In [ ]:
# One pretest/posttest horizontal bar chart per question category, each saved
# to an image file ("score pretest posttest h big nolabel <category>").
if True:

    # data to plot
    #barIndices = matrixToDisplay.index
    barIndices = scientificQuestions
    matrixToDisplaySubset = matrixToDisplay.loc[scientificQuestions,:]
    matrixToDisplaySubset['qCategory'] = questionCategories
    matrixToDisplaySubset = matrixToDisplaySubset.sort_values(by=['qCategory', 'posttest'])
    barIndices = matrixToDisplaySubset.index

    for qCategory in set(questionCategories):
        # All questions mapped to this category, sorted by posttest score.
        questionsOfThatCategory = [k for k, v in questionCategoriesDictionary.items() if v == qCategory]
        barIndices = questionsOfThatCategory
        bars =  matrixToDisplaySubset.loc[barIndices,:].sort_values(by=['posttest'], ascending=False)
        barIndices = bars.index
        pretestBars = bars['pretest']
        posttestBars = bars['posttest']
        n_groups = len(barIndices)

        # create plot
        fig, ax = plt.subplots(figsize=(12, n_groups*2))
        plt.xlim(0,85)
        index = np.arange(len(questionsOfThatCategory))
        bar_width = 0.35
        opacity = 0.8

        rects1 = plt.barh(index + bar_width, pretestBars, bar_width,
                         alpha=opacity,
                         color='b',
                         label='pretest')

        rects2 = plt.barh(index, posttestBars, bar_width,
                         alpha=opacity,
                         color='g',
                         label='posttest')


        plt.xlabel('correct answers (%)')

        # NOTE(review): y tick labels intentionally suppressed ("nolabel" figures).
        #plt.yticks(index + bar_width, barIndices)
        plt.legend()

        plt.tight_layout()
        plt.show()
        fig.savefig("score pretest posttest h big nolabel " + qCategory)

In [ ]:
#matrixToDisplay.to_csv("../../data/sortedPrePostProgression.csv")

In [ ]:
#matrixToDisplay.T

1.3 Per demography

1.3.1 English speakers


In [ ]:
cohortEN = gfdf[gfdf[QLanguage] == enLanguageID]

In [ ]:
#plotSamples(getTemporalitySamples(cohortEN))

1.3.2 French speakers


In [ ]:
cohortFR = gfdf[gfdf[QLanguage] == frLanguageID]

In [ ]:
#plotSamples(getTemporalitySamples(cohortFR))

1.3.3 Female


In [ ]:
cohortF = gfdf[gfdf[QGender] == 'Female']

In [ ]:
#plotSamples(getTemporalitySamples(cohortF))

1.3.4 Male


In [ ]:
cohortM = gfdf[gfdf[QGender] == 'Male']

In [ ]:
#plotSamples(getTemporalitySamples(cohortM))

1.3.5 biologists

strict

In [ ]:
cohortBioS = getSurveysOfBiologists(gfdf)

In [ ]:
#plotSamples(getTemporalitySamples(cohortBioS))
broad

In [ ]:
cohortBioB = getSurveysOfBiologists(gfdf, False)

In [ ]:
#plotSamples(getTemporalitySamples(cohortBioB))

1.3.6 gamers

strict

In [ ]:
cohortGamS = getSurveysOfGamers(gfdf)

In [ ]:
#plotSamples(getTemporalitySamples(cohortGamS))
broad

In [ ]:
cohortGamB = getSurveysOfGamers(gfdf, False)

In [ ]:
#plotSamples(getTemporalitySamples(cohortGamB))

In [ ]:
#T-tests between pretest and posttest scores among some player groups
plotBasicStats(gfdf, horizontalPlot=True, sortedAlong="progression", figsize=(20,4));
plotBasicStats(cohortF, horizontalPlot=True, sortedAlong="progression", figsize=(20,4));
plotBasicStats(cohortM, horizontalPlot=True, sortedAlong="progression", figsize=(20,4));
plotBasicStats(cohortGamB, horizontalPlot=True, sortedAlong="progression", figsize=(20,4));

1.4 answered only after

1.1 answers to scientific questions


In [ ]:
sciBinarizedBefore = getAllBinarized(getRMBefores(gfdf))
#sciBinarizedBefore = getAllBinarized(getGFBefores())

In [ ]:
#plotCorrelationMatrix( _binarizedMatrix, _title='Questions\' Correlations', _abs=False, _clustered=False, _questionNumbers=False ):
plotCorrelationMatrix(
                        sciBinarizedBefore,
                        _abs=False,
                        _clustered=False,
                        _questionNumbers=True,
                        _annot = True,
                        _figsize = (20,20),
                        _title='Correlations on survey questions before',
                    )

Signature reminder: `plotCorrelationMatrix(_binarizedMatrix, _title="Questions' Correlations", _abs=False, _clustered=False, _questionNumbers=False)`

thisClustermap, overlay = plotCorrelationMatrix( sciBinarizedBefore, _abs=True, _clustered=True, _questionNumbers=True, _annot = True, _figsize = (20,20), _metric='correlation' )


In [ ]:
sciBinarizedAfter = getAllBinarized(getRMAfters(gfdf))

In [ ]:
#plotCorrelationMatrix( _binarizedMatrix, _title='Questions\' Correlations', _abs=False, _clustered=False, _questionNumbers=False ):
plotCorrelationMatrix(
                        sciBinarizedAfter,
                        _abs=False,
                        _clustered=False,
                        _questionNumbers=True,
                        _annot = True,
                        _figsize = (20,20),
                        _title='Correlations on survey questions after',
                    )

In [ ]:
#plotCorrelationMatrix( _binarizedMatrix, _title='Questions\' Correlations', _abs=False, _clustered=False, _questionNumbers=False ):
thisClustermap, overlay = plotCorrelationMatrix(
                        sciBinarizedAfter,
                        _abs=False,
                        _clustered=True,
                        _questionNumbers=True,
                        _annot = True,
                        _figsize = (20,20),
                        _metric='correlation'
                    )

thisClustermap.ax_heatmap.annotate(overlay)

dir(thisClustermap)

dir(thisClustermap.ax_heatmap)

vars(thisClustermap)

vars(thisClustermap.ax_heatmap)

1.2 answers to all questions


In [ ]:
allQuestions = correctAnswers + demographicAnswers

allBinarized = getAllBinarized(gfdf, _source = allQuestions)
allBinarizedBefore = getAllBinarized(getRMBefores(gfdf), _source = allQuestions)
allBinarizedAfter = getAllBinarized(getRMAfters(gfdf), _source = allQuestions)

In [ ]:
#plotCorrelationMatrix( _binarizedMatrix, _title='Questions\' Correlations', _abs=False, _clustered=False, _questionNumbers=False ):
plotCorrelationMatrix(
                        allBinarized,
                        _abs=True,
                        _clustered=False,
                        _questionNumbers=True,
                        _annot = True,
                        _figsize = (20,20),
                        _title='Correlation of all answers',
                    )

thisClustermap, overlay = plotCorrelationMatrix( allBinarizedAfter, _abs=True, _clustered=True, _questionNumbers=True, _annot = True, _figsize = (20,20), _metric='correlation' )

1.3 answers to all questions, only before having played


In [ ]:
#plotCorrelationMatrix( _binarizedMatrix, _title='Questions\' Correlations', _abs=False, _clustered=False, _questionNumbers=False ):
plotCorrelationMatrix(
                        allBinarizedBefore,
                        _abs=False,
                        _clustered=False,
                        _questionNumbers=True,
                        _annot = True,
                        _figsize = (20,20),
                        _title='Correlations on all questions before',
                    )

thisClustermap, overlay = plotCorrelationMatrix( allBinarizedBefore, _abs=True, _clustered=True, _questionNumbers=True, _annot = True, _figsize = (20,20), _metric='correlation' )

1.4 answers to all questions, only after having played


In [ ]:
plotCorrelationMatrix(
                        allBinarizedAfter,
                        _abs=False,
                        _clustered=False,
                        _questionNumbers=True,
                        _annot = True,
                        _figsize = (20,20),
                        _title='Correlation of all answers after',
                    )

In [ ]:
allBinarizedAfterSub = allBinarizedAfter.copy()
allBinarizedAfterSub = allBinarizedAfterSub.loc[:,['Age'] + scientificQuestions]

In [ ]:
plotCorrelationMatrix(
                        allBinarizedAfterSub,
                        _abs=False,
                        _clustered=False,
                        _questionNumbers=True,
                        _annot = True,
                        _figsize = (20,20),
                        _title='Correlation of all answers after',
                    )

In [ ]:

2. Game sessions


In [ ]:
#startDate = minimum152Date
#endDate = maximum152Date

# Plot date range: one day of padding on each side of the observed
# RedMetrics event timestamps, so boundary days render fully.
startDate = rmdf['userTime'].min().date() - datetime.timedelta(days=1)
endDate = rmdf['userTime'].max().date() + datetime.timedelta(days=1)

In [ ]:
valuesPerDay = rmdf['userTime'].map(lambda t: t.date()).value_counts().sort_index()
plotPerDay(valuesPerDay, title='RedMetrics events', startDate=startDate, endDate=endDate)

In [ ]:
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]

In [ ]:
valuesPerDay = rmdf[rmdf['type'] == 'start']['userTime'].map(lambda t: t.date()).value_counts().sort_index()
plotPerDay(valuesPerDay, title='sessions', startDate=startDate, endDate=endDate)

In [ ]:
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]

In [ ]:
valuesPerDay = rmdf.groupby('userId').agg({ "userTime": np.min })['userTime'].map(lambda t: t.date()).value_counts().sort_index()
plotPerDay(valuesPerDay, title='game users', startDate=startDate, endDate=endDate)

In [ ]:
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]

In [ ]:
valuesPerDay = gfdf.groupby(localplayerguidkey).agg({ QTimestamp: np.min })[QTimestamp].map(lambda t: t.date()).value_counts().sort_index()
plotPerDay(valuesPerDay, title='survey answers', startDate=startDate, endDate=endDate)

In [ ]:
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]

In [ ]:
beforesPerDay = gfdf[gfdf[QTemporality] == answerTemporalities[0]].groupby(localplayerguidkey).agg({ QTimestamp: np.min })[QTimestamp].map(lambda t: t.date()).value_counts().sort_index()
aftersPerDay = gfdf[gfdf[QTemporality] == answerTemporalities[1]].groupby(localplayerguidkey).agg({ QTimestamp: np.min })[QTimestamp].map(lambda t: t.date()).value_counts().sort_index()
undefinedPerDay = gfdf[gfdf[QTemporality] == answerTemporalities[2]].groupby(localplayerguidkey).agg({ QTimestamp: np.min })[QTimestamp].map(lambda t: t.date()).value_counts().sort_index()

plotPerDay(beforesPerDay, title='survey befores', startDate=startDate, endDate=endDate)
plotPerDay(aftersPerDay, title='survey afters', startDate=startDate, endDate=endDate)
plotPerDay(undefinedPerDay, title='survey undefined', startDate=startDate, endDate=endDate)

In [ ]:

3. Per session and per user analysis

4. User comparison

to do: transfer part of 1.3's "'Google form analysis' functions tinkering" code here

percentagesCrossCorrect


In [ ]:
#pretests = gform[gform[QTemporality] == answerTemporalities[0]]
#pretests[pretests[QBBFunctionPlasmid] == ]

In [ ]:
# Cross-correct-answer percentages for the pretest: for each question pair,
# the share of respondents who answered both correctly.
binarized = sciBinarizedBefore
intermediaryNumerator = getCrossCorrectAnswers(binarized).round().astype(int)*100
percentagesCrossCorrect = (intermediaryNumerator / binarized.shape[0]).round().astype(int)
# Column sums of the binarized matrix: number of correct answers per question.
totalPerQuestion = np.dot(np.ones(binarized.shape[0]), binarized)
# Cell output: questions that nobody answered correctly in the pretest.
sciBinarizedBefore.columns[totalPerQuestion == 0]

In [ ]:
getPercentageCrossCorrect(sciBinarizedBefore, figsize=(40,40))

In [ ]:
getPercentageCrossCorrect(sciBinarizedAfter, figsize=(40,40))

In [ ]:
len(gfdf), len(getAllResponders(gfdf))

In [ ]:
matrixToDisplay = plotBasicStats(gfdf, horizontalPlot=True, sortedAlong="progression", figsize=(20,4));

In [ ]:
subjectCount = allData.shape[1]
measuredPretest = 100*allData.loc[pretestScientificQuestions,:].sum(axis='columns')/subjectCount
measuredPretest.index = scientificQuestions
measuredPosttest = 100*allData.loc[posttestScientificQuestions,:].sum(axis='columns')/subjectCount
measuredPosttest.index = scientificQuestions
measuredDelta2 = (measuredPosttest - measuredPretest)
measuredDelta2 = pd.DataFrame(measuredDelta2.round().astype(int))
measuredDelta2.columns = ["measuredDelta2"]
measuredDelta2 = measuredDelta2.sort_values(by = "measuredDelta2", ascending = True).T
_fig = plt.figure(figsize=(20,2))
_ax1 = plt.subplot(111)
_ax1.set_title("measuredDelta2")
sns.heatmap(
            measuredDelta2,
            ax=_ax1,
            cmap=plt.cm.jet,
            square=True,
            annot=True,
            fmt='d',
            vmin=0,
            vmax=100,
        )

In [ ]:
#(matrixToDisplay.loc['progression',scientificQuestions] - measuredDelta2.loc['measuredDelta2',scientificQuestions])

In [ ]:
testDF = pd.DataFrame(columns=[
    'pretest1', 'posttest1', 'measuredDelta',
    'pretest2', 'posttest2', 'matrixToDisplay'], data = 0, index= scientificQuestions)
testDF['pretest1'] = measuredPretest
testDF['posttest1'] = measuredPosttest
testDF['measuredDelta'] = measuredDelta2.T['measuredDelta2']
testDF['pretest2'] = matrixToDisplay.T['pretest'][scientificQuestions]
testDF['posttest2'] = matrixToDisplay.T['posttest'][scientificQuestions]
testDF['matrixToDisplay'] = matrixToDisplay.T['progression'][scientificQuestions]
testDF = testDF.round().astype(int)
#testDF

In [ ]:
measuredDelta = allData.loc[deltaScientificQuestions,:].sum(axis='columns')
measuredDelta.mean(), measuredDelta.median()
#measuredDelta.sort_values()

In [ ]:


In [ ]:
#pretestData = getAllUserVectorData( gfdf[gfdf[QTemporality] == answerTemporalities[0]], _source = correctAnswers )
#posttestData = getAllUserVectorData( gfdf[gfdf[QTemporality] == answerTemporalities[1]], _source = correctAnswers )

In [ ]:
plotAllUserVectorDataCorrelationMatrix(
    allData.T,
    _abs=False,
    _figsize = (40,40),
    _clustered=False
)

In [ ]:
demographicCriteria = demographicQuestions.copy()

plotAllUserVectorDataCorrelationMatrix(
    allData.T,
    _abs=False,
    _figsize = (20,20),
    _clustered=False,
    columnSubset=[]\
        + completionTimesCriteria
        + totalTimesCriteria
        + pretestScientificQuestions
        #+ posttestScientificQuestions
        #+ deltaScientificQuestions
        + overallScoreCriteria
        #+ demographicCriteria
)

In [ ]:
#completers = rmdf[rmdf['type'] == 'complete'][QUserId]
#nonCompleter = rmdf[~rmdf[QUserId].isin(completers)][QUserId].iloc[0]

In [ ]:
#getUserDataVector(nonCompleter)#.loc[14,:]

In [ ]:
#allData.shape

In [ ]:
#allData.index

completed vs played time


In [ ]:
# Assemble one row per player: total tutorial play time (seconds), test
# scores, and completion flag, pulled from the per-user allData matrix.
# NOTE(review): "pretestScore" is added on the fly below even though it is
# not in the declared columns list — pandas creates it on first assignment.
data = pd.DataFrame(index=allData.columns, columns=["time", "posttestScore", "deltaScore","completed"])
for userId in data.index:
    data.loc[userId, "time"] = getPlayedTimeUser(userId, _rmDF = rmdf)['tutorial']['totalSpentTime'].total_seconds()
    data.loc[userId, "posttestScore"] = allData.loc['scoreposttest', userId]
    data.loc[userId, "pretestScore"] = allData.loc['scorepretest', userId]
    data.loc[userId, "deltaScore"] = allData.loc['scoredelta', userId]
    data.loc[userId, "completed"] = allData.loc['complete', userId]
# Cell output: sanity check of the assembled table's dimensions.
data.shape

x = allScores.copy() x2 = completedScores.copy() y = allPlayedTimes.copy() y2 = completedPlayedTimes.copy()

plotDF = pd.DataFrame(index = x.index, data = x) plotDF['times'] = y

plotDF

(plotDF['times'] == y).all()


In [ ]:
x = data["posttestScore"]
x2 = data[data["completed"]==1]["posttestScore"]
y = data["time"]
y2 = data[data["completed"]==1]["time"]

plt.figure(figsize=(12, 4))
ax1 = plt.subplot(121)
plt.scatter(x, y)#, c='blue', alpha=0.5)
plt.scatter(x2, y2)#, c='red', alpha=0.5)
plt.xlabel('score')
plt.ylabel('time')
plt.title("time against score, n=" + str(len(x)))
#ax1.legend(loc='center left', bbox_to_anchor=(1, 0.5))

ax2 = plt.subplot(122)
plt.scatter(y, x)
plt.scatter(y2, x2)
plt.xlabel('time')
plt.ylabel('score')
plt.title("score against time, n=" + str(len(x)))
ax2.legend(loc='center left', bbox_to_anchor=(-1.2, 0.9), labels =["unfinished games","completed games"])

plt.show()

linear regression


In [ ]:
x = data["posttestScore"].astype(float)
x2 = data[data["completed"]==1]["posttestScore"].astype(float)
y = data["time"].astype(float)
y2 = data[data["completed"]==1]["time"].astype(float)

# Get the linear models
lm_original = np.polyfit(x, y, 1)
 
# calculate the y values based on the co-efficients from the model
r_x, r_y = zip(*((i, i*lm_original[0] + lm_original[1]) for i in x))
 
# Put in to a data frame, to keep is all nice
lm_original_plot = pd.DataFrame({
'scores' : r_x,
'times' : r_y
})

lm_original_plot = lm_original_plot.drop_duplicates()
lm_original_plot = lm_original_plot.sort_values(by="scores")
lm_original_plot = lm_original_plot.drop(lm_original_plot.index[1:-1])

In [ ]:
plt.figure(figsize=(6, 4))
ax = plt.subplot(111)
plt.scatter(x, y)
plt.scatter(x2, y2)
# Plot the original data and model
#lm_original_plot.plot(kind='line', color='Red', x='scores', y='times', ax=ax)
plt.plot('scores', 'times', data=lm_original_plot, color='Red')
plt.xlabel('score')
plt.ylabel('time') 
plt.show()

linear regression 2


In [ ]:
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

x = data["posttestScore"].astype(float)
x2 = data[data["completed"]==1]["posttestScore"].astype(float)
y = data["time"].astype(float)
y2 = data[data["completed"]==1]["time"].astype(float)

xReshaped = x.values.reshape(-1, 1)

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(xReshaped, y)

# Make predictions using the testing set
pred = regr.predict(xReshaped)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y, pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y, pred))

# Plot outputs
plt.scatter(x, y, color='black')
plt.plot(x, pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()

In [ ]:
regr.intercept_,regr.coef_

linear regression 3


In [ ]:
sns.regplot(x=x, y=y, color="b")
plt.scatter(x2, y2, color='red')
plt.xlabel("score")
plt.ylabel("time played")

data = pd.DataFrame(index = range(0, len(xReshaped)), data = xReshaped, columns = ['score'])

data['time'] = y.values

data

data2 = data.loc[:, ["time", "posttestScore"]] data2.index = range(0, data.shape[0])

data2

linear regression 4


In [ ]:
#import patsy
import statsmodels.formula.api as smf

In [ ]:
data2 = data.astype(float)

In [ ]:
### STATSMODELS ###

timeScoreformula = 'time ~ posttestScore'

# create a fitted model
lm1 = smf.ols(formula=timeScoreformula, data=data2).fit()

# print the coefficients
#lm1.params


#lm1.summary()

In [ ]:
# print the confidence intervals for the model coefficients
lm1.conf_int()

In [ ]:
# print the p-values for the model coefficients
# Represents the probability that the coefficient is actually zero
lm1.pvalues

In [ ]:
# print the R-squared value for the model
lm1.rsquared

Completed vs non-completed


In [ ]:
### STATSMODELS ###
timeScoreformula = 'time ~ posttestScore'
lm1 = smf.ols(formula=timeScoreformula, data=data2).fit()
lm2 = smf.ols(formula=timeScoreformula, data=data2[data2["completed"] == 0]).fit()
lm3 = smf.ols(formula=timeScoreformula, data=data2[data2["completed"] == 1]).fit()
lm1.rsquared,lm2.rsquared,lm3.rsquared

Score increase


In [ ]:
# Relative score increase per player.
# NOTE(review): players with a zero pretest score yield inf/NaN here —
# confirm whether that is the intended handling.
data['deltaScoreRate'] = data['deltaScore']/data['pretestScore']
meanDelta = data['deltaScore'].mean()
meanPretest = data['pretestScore'].mean()
# Cell output: ratio of means (not the mean of per-player ratios).
meanDelta/meanPretest

Correlations between durations and score on questions

correlations between completion time of checkpoint n and score on question Q


In [ ]:
overallScoreCriteria = ["scorepretest", "scoreposttest", "scoredelta",]

In [ ]:
# Column-name stems for the 15 tutorial checkpoints ("ch00" .. "ch14"),
# expanded into per-checkpoint completion-time and total-time column names,
# each list ending with the whole-game aggregate column.
stemTimesCriteria = ["ch%02d" % checkpointIndex for checkpointIndex in range(15)]
completionTimesCriteria = ["%scompletion" % stem for stem in stemTimesCriteria]
completionTimesCriteria.append("completionTime")
totalTimesCriteria = ["%stotal" % stem for stem in stemTimesCriteria]
totalTimesCriteria.append("totalTime")

In [ ]:
# Transpose allData to one row per player, then keep only players with a
# finite ch00 completion time — values of pd.Timedelta.max seconds
# presumably encode "checkpoint never reached" (confirm in the helpers).
allData2 = allData.T.rename(str,axis="columns")
allData3 = allData2[allData2['ch00completion'] < pd.Timedelta.max.total_seconds()]
# Cell output: number of players who completed checkpoint 00.
len(allData3)

In [ ]:
allData2[allData2[criterionLabel]>9e+09]

In [ ]:
# Inspect one outlier: the first user whose checkpoint-01 completion time
# exceeds 9e9 s (the missing-value sentinel range), listing their 'reach'
# events for that checkpoint.
sectionNb = '01'
criterionLabel = 'ch' + sectionNb + 'completion'
# NOTE(review): sectionName is unused below — the filter rebuilds the
# string inline instead.
sectionName = 'tutorial.Checkpoint' + sectionNb
testUserId = allData2[allData2[criterionLabel]>9e+09].index[0]
#rmdf or rmdfConcat
_rmdf = rmdfConcat
_rmdf[(_rmdf[QUserId] == testUserId) \
           & (_rmdf['type'] == 'reach') \
           & (_rmdf['section'] == 'tutorial.Checkpoint' + sectionNb) \
          ].loc[:, ['section', 'userTime']]

In [ ]:
testUserId

In [ ]:
_rmdf[(_rmdf[QUserId] == testUserId)]

In [ ]:
gfdf[gfdf[QUserId] == testUserId]

In [ ]:
# Correlations between per-checkpoint durations (completion + total times)
# and posttest per-question scores, rendered as a heatmap; the narrow right
# panel shows, per duration row, how many players had a valid (non-sentinel)
# time and thus entered that row's correlations.
#chosenPrefix = answerTemporalities[0]
chosenPrefix = answerTemporalities[1]
#chosenPrefix = "delta"

#warning: not the same as displayed columns, see lower
chosenCriteria = [chosenPrefix + " " + q for q in scientificQuestions] + overallScoreCriteria

durationsScoresCorrelations = pd.DataFrame(index=completionTimesCriteria+totalTimesCriteria, columns=chosenCriteria, data=np.nan)
durationsScoresCorrelations = durationsScoresCorrelations.rename(str, axis='rows')
annotationMatrix = np.empty(shape=[durationsScoresCorrelations.shape[0], 1], dtype=int)
#annotationMatrix2D = np.empty(durationsScoresCorrelations.shape, dtype=str)

allData2 = allData.T.rename(str,axis="columns")
for i in range(len(durationsScoresCorrelations.index)):
    checkpoint = durationsScoresCorrelations.index[i]
    # Drop players whose duration is the pd.Timedelta.max sentinel.
    allData3 = allData2[allData2[checkpoint] < pd.Timedelta.max.total_seconds()]
    annotationMatrix[i] = len(allData3)
    for q in durationsScoresCorrelations.columns:
        corr = np.corrcoef(allData3[checkpoint], allData3[q])
        # NOTE(review): corr[0,0] is the self-correlation (normally 1.0);
        # this check presumably flags degenerate/constant columns — confirm
        # whether corr[0,1] was intended here.
        if corr[0,0] < 0:
            print("[" + checkpoint + ";" + q + "]:" + str(corr[0,0]))
        #if pd.isnull(corr[0,1]):
        #    print("[" + checkpoint + ";" + q + "] null")
        durationsScoresCorrelations.loc[checkpoint, q] = corr[0,1]
        
_fig, (_a0, _a1) = plt.subplots(1,2, gridspec_kw = {'width_ratios':[50, 1]}, figsize=(15,10))

#_a0.set_title("correlations between times and " + chosenPrefix + " scores")
_a0.set_title("correlations between times and scores")

# Relabel columns for display (order must match chosenCriteria above).
durationsScoresCorrelations.columns = [q for q in scientificQuestions] + ["pretest score", "posttest score", "score increase",]
sns.heatmap(durationsScoresCorrelations, ax=_a0, cmap=plt.cm.jet, square=True, vmin=-1, vmax=1,
            # annot=True,
            # annot=annotationMatrix2D
            #cbar_kws= {'panchor':(0.0, 0.0)}
           )

# Side panel: per-row valid-sample counts.
_a1.set_title("")
sns.heatmap(annotationMatrix, ax=_a1, annot=annotationMatrix)

_fig.tight_layout()

In [ ]:
# Configurable correlation heatmap between two criteria sets; the active
# configuration correlates overall scores with pretest profile questions.
# The many commented alternatives are presets for other figure variants.
#chosenPrefix = answerTemporalities[0]
#chosenPrefix = answerTemporalities[1]
#chosenPrefix = "delta"

#warning: not the same as displayed columns, see lower
#questions1 = [QAge,QGender]
#questions1 = [QEnjoyed]

#questions2 = [
#    QCuriosityBiology,QCuriositySyntheticBiology,QCuriosityVideoGames,
#    QCuriosityEngineering,
##    QPlayed,
#    QAge,QGender,
#    QInterestVideoGames,
#    QInterestBiology,QStudiedBiology,QPlayVideoGames,
##    QHeardSynBioOrBioBricks,
##    QVolunteer,
#    QEnjoyed]

#questions2 = [
#    QCuriosityBiology,
#    QCuriositySyntheticBiology,
#    QCuriosityVideoGames,
#    QCuriosityEngineering,
#    QPlayed,
    #QAge,
    #QGender,
#    QInterestVideoGames,
#    QInterestBiology,
#    QStudiedBiology,
#    QPlayVideoGames,
#    QHeardSynBioOrBioBricks,
#    QVolunteer,
#    QEnjoyed #use only posttest value
#    ]

# Profile questions, grouped: engineering / biology / video games / demographics.
questions2 = [
    QCuriosityEngineering,
    
    QCuriosityBiology,
    QCuriositySyntheticBiology,
    QInterestBiology,
    QStudiedBiology,
    
    QCuriosityVideoGames,
    QInterestVideoGames,
    QPlayVideoGames,
    
#    QPlayed,
    QAge,
    QGender,
#    QHeardSynBioOrBioBricks,
#    QVolunteer,
#    QEnjoyed #use only posttest value
    ]



#chosenCriteria1 = completionTimesCriteria+totalTimesCriteria
#chosenCriteria1 = ["posttest " + q for q in scientificQuestions] + overallScoreCriteria
chosenCriteria1 = overallScoreCriteria
#chosenCriteria1 = ["pretest " + q for q in questions1]
#chosenCriteria1 = ["posttest " + q for q in questions1]

#chosenCriteria2 = ["posttest " + q for q in questions2]
#chosenCriteria2 = ["pretest " + q for q in questions2] + ["posttest " + QEnjoyed]
#chosenCriteria2 = ["posttest " + q for q in scientificQuestions] + overallScoreCriteria
chosenCriteria2 = ["pretest " + q for q in questions2]
#chosenCriteria2 = ["maxChapter"]

criteriaScoresCorrelations = pd.DataFrame(index=chosenCriteria1, columns=chosenCriteria2, data=np.nan)
criteriaScoresCorrelations = criteriaScoresCorrelations.rename(str, axis='rows')
# Per-row sample size (displayed in a later cell).
annotationMatrix = np.empty(shape=[criteriaScoresCorrelations.shape[0], 1], dtype=int)
#annotationMatrix2D = np.empty(durationsScoresCorrelations.shape, dtype=str)

#allData2 = allData.T.rename(str,axis="columns")
#allData2 = allBinaryDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers
allData2 = allNumericDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers
allData2 = allData2.T.rename(str,axis="columns")

for i in range(len(criteriaScoresCorrelations.index)):
    criterion1i  = criteriaScoresCorrelations.index[i]
    
    # Completion times carry pd.Timedelta.max as a "never completed" sentinel;
    # drop those rows before correlating.
    allData3 = allData2
    if criterion1i in completionTimesCriteria:
        allData3 = allData2[allData2[criterion1i] < pd.Timedelta.max.total_seconds()]
    annotationMatrix[i] = len(allData3)
    
    for criterion2j in criteriaScoresCorrelations.columns:
        corr = np.corrcoef(allData3[criterion1i], allData3[criterion2j])
        # Sanity check: corr[0,0] is the self-correlation, expected to be 1.
        if corr[0,0] < 0:
            print("[" + criterion1i + ";" + criterion2j + "]:" + str(corr[0,0]))
        #if pd.isnull(corr[0,1]):
        #    print("[" + criterion1i + ";" + criterion2j + "] null")
        criteriaScoresCorrelations.loc[criterion1i, criterion2j] = corr[0,1]
        
# Relabel axes with human-readable names for the figure.
#index 1
#criteriaScoresCorrelations.index = scientificQuestions + ["pretest score", "posttest score", "score increase"]
#criteriaScoresCorrelations.index = questions1
criteriaScoresCorrelations.index = ["pretest score", "posttest score", "score increase"]

#columns 2
criteriaScoresCorrelations.columns = questions2
#criteriaScoresCorrelations.columns = questions2 + [QEnjoyed]
#criteriaScoresCorrelations.columns = scientificQuestions + ["pretest score", "posttest score", "score increase"]
#criteriaScoresCorrelations.columns = ["max. checkpoint"]

# (10,20) big
# (12,5) small
#_fig, (_a0) = plt.subplots(1,1, figsize=(10,18))
_fig, (_a0) = plt.subplots(1,1, figsize=(6,10))
#_fig, (_a0, _a1) = plt.subplots(
#    1,2, figsize=(5,25), gridspec_kw = {'width_ratios':[15, 1]})
#    2,1, figsize=(17,12), gridspec_kw = {'height_ratios':[30, 1]})

#sns.set(font_scale=1)
#sns.set(font_scale=1.3)
sns.set(font_scale=1.7)
data = criteriaScoresCorrelations.T

#_a0.set_title("correlations between times and demographic criteria")
#_a0.set_title("correlations between scores and demographic criteria")
#_a0.set_title("correlations between (age, gender) and (curiosity, interest, practice, enjoyment)")
#_a0.set_title("correlations between enjoyment and age, gender, curiosity, interest, practice, enjoyment")
#plt.title("correlations between enjoyment and age, gender, curiosity, interest, practice")
#_a0.set_title("correlations between times and scores")
#_a0.set_title("correlations between scores and maximum checkpoint reached")

_a0.set_anchor('C')
sns.heatmap(data, ax=_a0,
            #cmap=plt.cm.jet,
            cmap="RdBu_r",
            square=True, vmin=-1, vmax=1,
             annot=True,
            # cbar = False,
            # annot=annotationMatrix2D
            #cbar_kws= {'panchor':(0.0, 0.0)}
            #cbar_kws = dict(use_gridspec=False,location="right"),
            annot_kws={"size": 13},
            #annot_kws={"size": 13},
           )

#_a1.set_anchor('C')
#data = annotationMatrix.T
#sns.heatmap(data, ax=_a1, annot=data, square=True,
#             cbar = False,xticklabels=False,yticklabels=False,annot_kws={"size": 12})

_fig.tight_layout()

In [ ]:


In [ ]:
# Companion figure: per-row sample sizes (annotationMatrix) for the
# correlation heatmaps above.
_fig, (_a1) = plt.subplots(1,1, figsize=(10,5))
_a1.set_anchor('C')
#data = pd.Series(data=annotationMatrix.flatten(), index=completionTimesCriteria+totalTimesCriteria)
data = annotationMatrix.T
sns.heatmap(data, 
            ax=_a1,
            annot=data,
            square=True,
             cbar = False,
            #xticklabels=False,
            xticklabels=completionTimesCriteria+totalTimesCriteria,
            yticklabels=False,
            #yticklabels=completionTimesCriteria+totalTimesCriteria,
            annot_kws={"size": 12})

_fig.tight_layout()

In [ ]:
# Sample size for the first duration criterion, after sentinel filtering.
i = 0
checkpoint = durationsScoresCorrelations.index[i]
print(checkpoint + ": " + str(len(allData2[allData2[checkpoint] < pd.Timedelta.max.total_seconds()])))

In [ ]:
# Spot-check the per-checkpoint total times of one arbitrary respondent.
testUserId = gfdf[QUserId].unique()[12]
getCheckpointsTotalTimesUser(testUserId, rmdf)

In [ ]:
#timedSectionnedEvents.to_csv("ch4.csv", encoding=csvEncoding)

In [ ]:
#getAllResponders(gfdf), _source = correctAnswers, _rmDF = rmdf
#testUserId = "4731525f-62dd-4128-ab56-3991b403e17e"
#getUserDataVector(testUserId,_source = correctAnswers, _rmDF = rmdf)

Max. chapter vs. scores


In [ ]:
# Select the answer columns ("posttest <question>") and sample used by the
# max-chapter analyses below.
# delta or posttest?
# posttest: values 0, 1 managed in plotCorrectedAnswerPerMaxCheckpoint
# delta can't work: values 0, 1 and -1 not managed in plotCorrectedAnswerPerMaxCheckpoint
chosenPrefix = "posttest"
chosenQuestions = [chosenPrefix + " " + q for q in scientificQuestions]
criteria = ["maxChapter","complete"] + chosenQuestions + overallScoreCriteria

#data = allBinaryDataPlaytestPhase1PretestPosttestUniqueProfiles.loc[criteria,:]
data = allBinaryDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.loc[criteria,:]

#data = allBinaryDataPlaytestPhase2PretestPosttestUniqueProfiles.loc[criteria,:]
#data = allBinaryDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers.loc[criteria,:]

#data = allNumericDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.loc[criteria,:]

In [ ]:
# Two example questions and the x-axis column used by the plots below.
criterion1 = chosenPrefix + ' Function - biology: CDS'
criterion2 = chosenPrefix + ' Device: RBS:PCONS:FLHDC:TER XXX'
xIndex = 'maxChapter'
dataT = data.T

In [ ]:
def getCheckpointThreshold(bars0, bars1, thresholdRatio = .9):
    """Find the checkpoint above which players mostly answer correctly.

    Scans checkpoints from the highest down, comparing the cumulative count
    of correct answers against thresholdRatio times the cumulative count of
    all answers in that tail.

    Args:
        bars0: per-checkpoint counts of incorrect answers (index = checkpoint).
        bars1: per-checkpoint counts of correct answers, same length as bars0.
        thresholdRatio: minimum fraction of correct answers required.

    Returns:
        1 + the highest checkpoint whose tail falls below the threshold,
        or 0 if the threshold holds over every tail.
    """
    # Cumulative counts from the last checkpoint backwards: index i covers
    # checkpoints len-1-i .. len-1.
    cumulative0 = np.cumsum(list(reversed(bars0)))
    cumulative1 = np.cumsum(list(reversed(bars1)))
    # Generalized from the hard-coded 15 of the original 15-checkpoint game.
    checkpointCount = len(cumulative1)
    result = 0
    for i in range(len(cumulative1)):
        thresholdCount = np.floor(thresholdRatio * (cumulative0[i] + cumulative1[i]))
        if cumulative1[i] < thresholdCount:
            result = checkpointCount - i
            break
    return result

In [ ]:
from matplotlib import rc
import pandas as pd
 
# stacked horizontal bar plot; cf df.plot.barh?
def plotCorrectedAnswerPerMaxCheckpoint(dataT, criterion, saveFig=False, plotFig=True, thresholdRatio=0.9):
    # y-axis in bold
    rc('font', weight='bold')

    # Values of each group

    bars0 = [len(dataT[(dataT[criterion]==0) & (dataT[xIndex]==maxChapterValue)]) for maxChapterValue in range(15)]
    bars1 = [len(dataT[(dataT[criterion]==1) & (dataT[xIndex]==maxChapterValue)]) for maxChapterValue in range(15)]

    if plotFig:
        # Heights of bars1 + bars2 (TO DO better)
        bars = [bars0[i] + bars1[i] for i in range(len(bars0))]

        # The position of the bars on the x-axis
        r = [i for i in range(15)]

        # Names of group and bar width
        names = [i for i in range(15)]
        barWidth = 1

        fig, ax = plt.subplots(1,1, figsize=(10,6))

        # Create red bars
        ax.bar(r, bars0, color='#cc0c28', edgecolor='white', width=barWidth)
        # Create green bars (middle), on top of the firs ones
        ax.bar(r, bars1, bottom=bars0, color='#557f2d', edgecolor='white', width=barWidth)

        # Custom X axis
        plt.xticks(r, names, fontweight='bold')
        plt.xlabel("max. checkpoint")
        plt.ylabel("count")
        plt.title("Answers to question '" + criterion + "' against max. checkpoint, n=" + str(len(dataT.index)))
        ax.legend(["incorrect", "correct"], 
                    bbox_to_anchor=(0.7, 0.7),
    #                loc="upper center",
                )

        # Show graphic
        plt.show()

        if saveFig:
            #correctedAnswersPerMaxCheckpoint
            questionTitle = "cAPMC-'" + criterion.replace(" ", "_").replace(":", "") + "'"
            try:
                fig.savefig(questionTitle)
            except:
                print("- savefig failed for " + questionTitle)
            
    return [bars0, bars1, getCheckpointThreshold(bars0, bars1, thresholdRatio)]

# Example run on one question, with the plot displayed.
[bars0, bars1, threshold] = plotCorrectedAnswerPerMaxCheckpoint(dataT, criterion2, saveFig=False, plotFig=True)
threshold

In [ ]:
# Same counts with the strictest ratio (100% correct required).
getCheckpointThreshold(bars0, bars1, thresholdRatio = 1)

In [ ]:
np.cumsum(bars1)

In [ ]:
# Threshold checkpoint per scientific question (default 15 = none found),
# computed without plotting.
thresholdsCheckpoints = pd.Series(index = chosenQuestions, data = 15, name = "thresholdsCheckpoints")

for criterion in chosenQuestions:
    [bars0, bars1, threshold] = plotCorrectedAnswerPerMaxCheckpoint(
        dataT,
        criterion,
        saveFig=False,
        plotFig=False,
        thresholdRatio=0.8
    )
    thresholdsCheckpoints[criterion] = threshold
thresholdsCheckpoints

In [ ]:
thresholdsCheckpoints

In [ ]:
def plotCheckpointsFromThreshold(dataT, criterion, saveFig=False):
    """Plot the threshold checkpoint of `criterion` as a function of the
    threshold ratio, sampled over [0.5, 1.0] in 11 steps.

    Args:
        dataT: answers DataFrame, one row per player.
        criterion: binary (0/1) answer column to analyze.
        saveFig: save the figure to disk when True.

    Returns:
        ys: the threshold checkpoints, one per sampled ratio.
    """
    xs = []
    ys = []
    for x in np.linspace(0.5,1,11):
        [bars0, bars1, thresholdCheckpoint] = plotCorrectedAnswerPerMaxCheckpoint(
            dataT,
            criterion,
            saveFig=False,
            plotFig=False,
            thresholdRatio=x
        )
        xs += [x]
        ys += [thresholdCheckpoint]
    fig = plt.figure(figsize=(12, 4))
    ax1 = plt.subplot(111)
    plt.plot(xs, ys)
    plt.ylim((-0.5, 14.5))
    plt.xlabel('threshold')
    plt.ylabel('checkpoint')
    plt.title("Checkpoint against threshold, for question '" + criterion + "'")
    plt.show()

    if saveFig:
        # cFT = checkpointsFromThreshold; strip characters unsafe in file names.
        questionTitle = "cFT-'" + criterion.replace(" ", "_").replace(":", "") + "'"
        # Best-effort save: report failures instead of silently swallowing
        # every exception (previously a bare except).
        try:
            fig.savefig(questionTitle)
        except Exception as e:
            print("- savefig failed for " + questionTitle + ": " + repr(e))
    return ys

ys = plotCheckpointsFromThreshold(dataT, criterion2)

In [ ]:
def getMostFrequentThreshold(ys):
    """Return the (value, count) pair of the most common informative
    threshold checkpoint in ys, ignoring the degenerate values 0 and 15.
    Falls back to the most common value overall when every entry is 0 or 15.
    """
    informative = [checkpoint for checkpoint in ys if checkpoint not in (0, 15)]
    candidates = informative if informative else ys
    return Counter(candidates).most_common(1)[0]

In [ ]:
from collections import Counter

thresholdsCheckpoints2 = pd.DataFrame(index = chosenQuestions, columns = ['threshold', 'count'], data = 15)

for criterion in chosenQuestions:
    ys = plotCheckpointsFromThreshold(dataT, criterion, saveFig=False)
    thresholdsCheckpoints2.loc[criterion, 'threshold'] = getMostFrequentThreshold(ys)[0]
    thresholdsCheckpoints2.loc[criterion, 'count'] = getMostFrequentThreshold(ys)[1]

In [ ]:
thresholdsCheckpoints2

In [ ]:
#for criterion in criteria:
criterion = 'scoreposttest'
# x: max checkpoint reached, y: posttest score — one point per player.
x = data.loc["maxChapter",:].values
y = data.loc[criterion,:].values

plt.figure(figsize=(6, 6))
ax1 = plt.subplot(111)
plt.scatter(x, y)#, c='blue', alpha=0.5)
plt.xlabel('max. checkpoint')
plt.ylabel("posttest score")
plt.title("Posttest score against max. checkpoint, n=" + str(len(x)))
plt.show()

In [ ]:
# Same data with per-checkpoint mean estimates and a regression line.
sns.regplot(x=x, y=y, color="b", x_estimator=np.mean)
plt.xlabel("max. checkpoint")
plt.ylabel("posttest score")
plt.title("Posttest score against max. checkpoint, n=" + str(len(x)))

In [ ]:
#import patsy
import statsmodels.formula.api as smf

In [ ]:
dataT = data.T.astype(float)

In [ ]:
### STATSMODELS ###

# Ordinary least squares: posttest score explained by max checkpoint reached.
scoreCheckpointformula = criterion + ' ~ maxChapter'

# create a fitted model
lm1 = smf.ols(formula=scoreCheckpointformula, data=dataT).fit()

# print the coefficients
#lm1.params


#lm1.summary()

In [ ]:
# print the confidence intervals for the model coefficients
lm1.conf_int()

In [ ]:
# print the p-values for the model coefficients
# Represents the probability that the coefficient is actually zero
lm1.pvalues

In [ ]:
# print the R-squared value for the model
lm1.rsquared

In [ ]:
from scipy import optimize

#x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13, 14, 15], dtype=float)
#y = np.array([5, 7, 9, 11, 13, 15, 28.92, 42.81, 56.7, 70.59, 84.47, 98.36, 112.25, 126.14, 140.03])

def piecewise_linear(x, x0, y0, k1, k2):
    """Two-segment ("broken-stick") linear function of x.

    Both segments pass through the breakpoint (x0, y0); the left segment
    (x < x0) has slope k1, the right segment (x >= x0) has slope k2.
    """
    leftSegment = lambda v: k1*v + y0-k1*x0
    rightSegment = lambda v: k2*v + y0-k2*x0
    return np.piecewise(x, [x < x0], [leftSegment, rightSegment])

# Fit the broken-stick model to (max checkpoint, posttest score); p holds the
# fitted parameters (x0, y0, k1, k2), e their covariance matrix.
p , e = optimize.curve_fit(piecewise_linear, x, y)
xd = np.linspace(0, 14, 100)
plt.plot(x, y, "o")
plt.plot(xd, piecewise_linear(xd, *p))
plt.xlabel("max. checkpoint")
plt.ylabel("posttest score")
# piecewise regression
plt.title("Posttest score against max. checkpoint, segmented regression")

5. Game map

Player filtering


In [ ]:
# Normalize the raw RedMetrics events into the player-level table used below.
#players = rmdf.loc[:, playerFilteringColumns]
players = safeGetNormalizedRedMetricsCSV( rmdf )
players.shape

In [ ]:
#players = players.dropna(how='any')
#players.head(1)
#rmdf.head(1)

In [ ]:
players.shape[0]

In [ ]:
#players = players[~players['userId'].isin(excludedIDs)];
#players.shape[0]

Sessions (filtered)


In [ ]:
# Number of distinct game sessions in the filtered sample.
sessionscount = players["sessionId"].nunique()
sessionscount

Sessions of dev IDs


In [ ]:

Unique players


In [ ]:
# Number of distinct user ids in the filtered sample.
uniqueplayers = players['userId']
uniqueplayers = uniqueplayers.unique()
uniqueplayers.shape[0]

In [ ]:
#uniqueplayers

Unique platforms


In [ ]:
# Distinct platforms the game was played on.
uniqueplatforms = players['customData.platform'].unique()
uniqueplatforms

Checkpoints passed / furthest checkpoint (unfiltered)


In [ ]:
# Max checkpoint per session: keep 'reach' events in tutorial sections and
# take the per-session maximum. NOTE(review): .max() compares section name
# strings — assumes checkpoint numbers are zero-padded so lexicographic
# order matches numeric order; confirm against the event naming scheme.
checkpoints = rmdf.loc[:, ['type', 'section', 'sessionId']]
checkpoints = checkpoints[checkpoints['type']=='reach'].loc[:,['section','sessionId']]
checkpoints = checkpoints[checkpoints['section'].str.startswith('tutorial', na=False)]
checkpoints = checkpoints.groupby("sessionId")
checkpoints = checkpoints.max()
#len(checkpoints)
checkpoints.head()

In [ ]:
# Histogram data: how many sessions peaked at each checkpoint.
maxCheckpointTable = pd.DataFrame({"maxCheckpoint" : checkpoints.values.flatten()})
maxCheckpointCounts = maxCheckpointTable["maxCheckpoint"].value_counts()
# Placeholder for sessions that never reached a checkpoint (filled below).
maxCheckpointCounts['Start'] = None
maxCheckpointCounts = maxCheckpointCounts.sort_index()
print('\nmaxCheckpointCounts=\n{0}'.format(str(maxCheckpointCounts)))

In [ ]:
# Total number of sessions that reached at least one checkpoint.
maxCheckpointCountsTable = pd.DataFrame({"maxCheckpoint" : maxCheckpointCounts.values})
maxCheckpointCountsTableCount = maxCheckpointCountsTable.sum(0)[0]
maxCheckpointCountsTableCount

In [ ]:
checkpoints.count()

In [ ]:
maxCheckpointCountsTable.head()

In [ ]:
maxCheckpointCountsTable.describe()

In [ ]:
genericTreatment( maxCheckpointCountsTable, "best checkpoint reached", "game sessions", 0, maxCheckpointCountsTableCount, False, True )

Session starts


In [ ]:
#starts = rmdf.loc[:, checkpointsRelevantColumns]
#starts = checkpoints[checkpoints['type']=='start'].loc[:,['playerId']]
#starts = checkpoints[checkpoints['section'].str.startswith('tutorial', na=False)]
#starts = checkpoints.groupby("playerId")
#starts = checkpoints.max()
#starts.head()

In [ ]:
# Add sessions that started but never reached a checkpoint under 'Start'.
startTutorial1Count = sessionscount
neverReachedGameSessionCount = startTutorial1Count - maxCheckpointCountsTableCount
fullMaxCheckpointCounts = maxCheckpointCounts
fullMaxCheckpointCounts['Start'] = neverReachedGameSessionCount
fullMaxCheckpointCountsTable = pd.DataFrame({"fullMaxCheckpoint" : fullMaxCheckpointCounts.values})

genericTreatment( fullMaxCheckpointCountsTable, "best checkpoint reached", "game sessions", 0, startTutorial1Count, False, True )

print('\nfullMaxCheckpointCountsTable=\n{0}'.format(fullMaxCheckpointCountsTable))
fullMaxCheckpointCountsTable.describe()

Duration

Duration of playing sessions


In [ ]:
# Session duration = span between first and last event timestamps.
durations = players.groupby("sessionId").agg({ "serverTime": [ np.min, np.max  ] })
durations["duration"] = pd.to_datetime(durations["serverTime"]["amax"]) - pd.to_datetime(durations["serverTime"]["amin"])
durations["duration"] = durations["duration"].map(lambda x: np.timedelta64(x, 's'))
durations = durations.sort_values(by=['duration'], ascending=[False])
durations.head()

Duration plot


In [ ]:
type(durations)

In [ ]:
#durations.loc[:,'duration']
#durations = durations[4:]
# Played time per session, sessions ranked by decreasing duration.
# NOTE(review): .seconds drops whole days from a Timedelta — sessions longer
# than 24h would wrap; confirm no such sessions exist in the sample.
durations["duration_seconds"] = durations["duration"].map(lambda x: pd.Timedelta(x).seconds)
maxDuration = np.max(durations["duration_seconds"])
durations["duration_rank"] = durations["duration_seconds"].rank(ascending=False)
ax = durations.plot(x="duration_rank", y="duration_seconds")
plt.xlabel("game session")
plt.ylabel("time played (s)")
#plt.legend('')
ax.legend_.remove()
plt.xlim(0, sessionscount)
plt.ylim(0, maxDuration)
durations["duration_seconds"].describe()
#durations.head()

Phase 1 vs Phase 2 comparison

Completion rate


In [ ]:
# Completion rates per sample: total playtest, phase 1, phase 2.
getCompletedRate(rmdfPlaytestTotalPretestPosttestUniqueProfilesVolunteers),\
getCompletedRate(rmdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers),\
getCompletedRate(rmdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers),\

In [ ]:
# Sample sizes (unique users) of both playtest phases.
rmdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers[QUserId].nunique(),\
rmdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers[QUserId].nunique()

In [ ]:
# Completion rates of the two online releases.
getCompletedRate(rmdfWebgl1522Timed),\
getCompletedRate(rmdfWebgl160Timed)

In [ ]:
### Scores

# Independent-samples t-test of posttest scores, phase 1 vs phase 2 — first
# all profiles, then volunteers only.
# NOTE(review): the bare 'ttest' lines mid-cell have no effect; the repeated
# pattern is a candidate for a small helper function.
scoresPhase1 = allDataPlaytestPhase1PretestPosttestUniqueProfiles.loc['scoreposttest',:]
scoresPhase2 = allDataPlaytestPhase2PretestPosttestUniqueProfiles.loc['scoreposttest',:]

ttest = ttest_ind(scoresPhase1, scoresPhase2)
ttest
print("t test: statistic=" + repr(ttest.statistic) + " pvalue=" + repr(ttest.pvalue))

scoresPhase1 = allDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.loc['scoreposttest',:]
scoresPhase2 = allDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers.loc['scoreposttest',:]

ttest = ttest_ind(scoresPhase1, scoresPhase2)
ttest
print("t test: statistic=" + repr(ttest.statistic) + " pvalue=" + repr(ttest.pvalue))

In [ ]:
# t-tests of every duration metric (total, completion, per-checkpoint)
# between the two phases.
nbs = ["{0:0=2d}".format(i) for i in range(0,15)]
completions = ['ch' + nb + 'completion' for nb in nbs]
totals = ['ch' + nb + 'total' for nb in nbs]
timeLabels = ['totalTime', 'completionTime'] + completions + totals
for timeLabel in timeLabels:
    timesPhase1 = allDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.loc[timeLabel,:]
    timesPhase2 = allDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers.loc[timeLabel,:]

    ttest = ttest_ind(timesPhase1, timesPhase2)
    ttest
    print(timeLabel + " t test: statistic=" + repr(ttest.statistic) + " pvalue=" + repr(ttest.pvalue))

In [ ]:
allDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.index.tolist()

Played time on critical checkpoints

Best players


In [ ]:
# Record-holding player per release — presumably the fastest/best player;
# confirm getRecordPlayer's semantics in the RM-GF correlations notebook.
getRecordPlayer(rmdf1522, gform)

In [ ]:
getRecordPlayer(rmdf160, gform)