In [ ]:
# Load shared helpers (RM-GF correlation utilities, plotting) from sibling notebooks.
%run "../Functions/8. RM-GF correlations.ipynb"
%run "../Functions/Plot.ipynb"
In [ ]:
# Cohort selection: uncomment exactly one (gfdf, rmdf) pair to switch datasets.
### Online 1.52.2
#gfdf = gfdfWebgl1522PretestPosttestUniqueProfilesVolunteers.copy()
#rmdf = rmdfWebgl1522PretestPosttestUniqueProfilesVolunteers.copy()
### Playtest
#gfdf = gfdfPlaytestTotalPretestPosttestUniqueProfilesVolunteers.copy()
#gfdf = gfdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.copy()
#gfdf = gfdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers.copy()
#rmdf = rmdfPlaytestTotalPretestPosttestUniqueProfilesVolunteers.copy()
#rmdf = rmdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.copy()
#rmdf = rmdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers.copy()
### Online 1.60
#gfdf = gfdfWebgl160PretestPosttestUniqueProfilesVolunteers.copy()
#rmdf = rmdfWebgl160PretestPosttestUniqueProfilesVolunteers.copy()
In [ ]:
# For quicker allData switching.
# Active cohort: Playtest Phase 1 — survey answers (gfdf), RedMetrics events (rmdf),
# and the binarized per-user answer matrix (allData).
gfdf = gfdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.copy()
rmdf = rmdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.copy()
allData = allBinaryDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.copy()
In [ ]:
# For quicker allData switching.
#gfdf = gfdfWebgl1522Timed.copy()
#rmdf = rmdfWebgl1522Timed.copy()
In [ ]:
print("sample: gform")
print("surveys: %s" % len(gform))
print("unique users: %s" % getUniqueUserCount(gform))
print("RM before: %s" % len(gform[gform[QTemporality] == answerTemporalities[0]]))
print("GF before: %s" % len(getGFormBefores(gform)))
print("RM after: %s" % len(gform[gform[QTemporality] == answerTemporalities[1]]))
print("GF after: %s" % len(getGFormAfters(gform)))
print("unique biologists: %s" % getUniqueUserCount(getSurveysOfBiologists(gform)))
print("unique gamers: %s" % getUniqueUserCount(getSurveysOfGamers(gform)))
print("unique perfect users: %s" % getUniqueUserCount(getSurveysOfUsersWhoAnsweredBoth(gform)))
print("unique perfect users: %s" % getPerfectPretestPostestPairsCount(gform))
In [ ]:
print("sample: gfdf")
print("surveys: %s" % len(gfdf))
print("unique users: %s" % getUniqueUserCount(gfdf))
print("RM before: %s" % len(gfdf[gfdf[QTemporality] == answerTemporalities[0]]))
print("GF before: %s" % len(getGFormBefores(gfdf)))
print("RM after: %s" % len(gfdf[gfdf[QTemporality] == answerTemporalities[1]]))
print("GF after: %s" % len(getGFormAfters(gfdf)))
print("unique biologists: %s" % getUniqueUserCount(getSurveysOfBiologists(gfdf)))
print("unique gamers: %s" % getUniqueUserCount(getSurveysOfGamers(gfdf)))
print("unique perfect users: %s" % getUniqueUserCount(getSurveysOfUsersWhoAnsweredBoth(gfdf)))
print("unique perfect users: %s" % getPerfectPretestPostestPairsCount(gfdf))
In [ ]:
print("category | count")
print("--- | ---")
print("sample | gform")
print("surveys | %s" % len(gform))
print("unique users | %s" % getUniqueUserCount(gform))
print("RM before | %s" % len(gform[gform[QTemporality] == answerTemporalities[0]]))
print("GF before | %s" % len(getGFormBefores(gform)))
print("RM after | %s" % len(gform[gform[QTemporality] == answerTemporalities[1]]))
print("GF after | %s" % len(getGFormAfters(gform)))
print("unique biologists | %s" % getUniqueUserCount(getSurveysOfBiologists(gform)))
print("unique gamers | %s" % getUniqueUserCount(getSurveysOfGamers(gform)))
print("unique perfect users | %s" % getUniqueUserCount(getSurveysOfUsersWhoAnsweredBoth(gform)))
print("unique perfect users | %s" % getPerfectPretestPostestPairsCount(gform))
print()
#print("(" + str(pd.to_datetime('today').date()) + ")")
print("("+dataFilesNamesStem+")")
In [ ]:
print("category | count")
print("--- | ---")
print("sample | gfdf")
print("surveys | %s" % len(gfdf))
print("unique users | %s" % getUniqueUserCount(gfdf))
print("RM before | %s" % len(gfdf[gfdf[QTemporality] == answerTemporalities[0]]))
print("GF before | %s" % len(getGFormBefores(gfdf)))
print("RM after | %s" % len(gfdf[gfdf[QTemporality] == answerTemporalities[1]]))
print("GF after | %s" % len(getGFormAfters(gfdf)))
print("unique biologists | %s" % getUniqueUserCount(getSurveysOfBiologists(gfdf)))
print("unique gamers | %s" % getUniqueUserCount(getSurveysOfGamers(gfdf)))
print("unique perfect users | %s" % getUniqueUserCount(getSurveysOfUsersWhoAnsweredBoth(gfdf)))
print("unique perfect users | %s" % getPerfectPretestPostestPairsCount(gfdf))
print()
#print("(" + str(pd.to_datetime('today').date()) + ")")
print("("+dataFilesNamesStem+")")
In [ ]:
#plotSamples(getDemographicSamples(gfdf))
In [ ]:
#plotSamples(getTemporalitySamples(gfdf))
In [ ]:
# Pretest subsets, per source: GF = Google Form selection, RM = RedMetrics temporality.
gf_befores = getGFormBefores(gfdf)
rm_befores = getRMBefores(gfdf)
gfrm_befores = getRMBefores(getGFormBefores(gfdf))
In [ ]:
# Sanity check: both pretest selections should cover the same users in the same order.
(gf_befores[QUserId] == rm_befores[QUserId]).all()
In [ ]:
#plotSamples(getDemographicSamples(gf_befores))
In [ ]:
gf_afters = getGFormAfters(gfdf)
rm_afters = getRMAfters(gfdf)
gfrm_afters = getRMAfters(getGFormBefores(gfdf))
In [ ]:
(gf_afters[QUserId] == rm_afters[QUserId]).all()
In [ ]:
#plotSamples(getDemographicSamples(gf_afters))
In [ ]:
# Users who answered both pretest and posttest, per source combination.
gf_both = getSurveysOfUsersWhoAnsweredBoth(gfdf, gfMode = True, rmMode = False)
rm_both = getSurveysOfUsersWhoAnsweredBoth(gfdf, gfMode = False, rmMode = True)
gfrm_both = getSurveysOfUsersWhoAnsweredBoth(gfdf, gfMode = True, rmMode = True)
In [ ]:
#plotSamples(getDemographicSamples(gf_both))
In [ ]:
#plotSamples(getDemographicSamples(rm_both))
In [ ]:
#plotSamples(getDemographicSamples(gfrm_both))
In [ ]:
# Heatmap of per-question correct-answer percentages (unsorted question order).
matrixToDisplay = plotBasicStats(
gfdf,
horizontalPlot=False,
sortedAlong="",
figsize=(12,20),
title = 'percentages of correct answers',
annot=True,
annot_kws={"size": 13},
font_scale=1.3,
);
In [ ]:
# Same heatmap, with questions sorted by pretest-to-posttest progression.
# Note: matrixToDisplay is overwritten here; later cells use this sorted version.
matrixToDisplay = plotBasicStats(
gfdf,
title = 'percentages of correct answers (sorted)',
sortedAlong="progression",
horizontalPlot=False,
figsize=(12,20),
annot=True,
annot_kws={"size": 13},
font_scale=1.3,
);
In [ ]:
# Disabled draft: plain pretest bar plot over the scientific questions.
# NOTE(review): the notebook export stripped the indentation under `if False:`;
# restore it before re-enabling these cells.
if False:
#barIndices = matrixToDisplay.index
barIndices = scientificQuestions
matrixToDisplaySubset = matrixToDisplay.loc[scientificQuestions,:]
pretestBars = matrixToDisplaySubset.loc[barIndices, 'pretest']
posttestBars = matrixToDisplaySubset.loc[barIndices, 'posttest']
plt.bar(np.arange(len(barIndices)), pretestBars)
In [ ]:
# Disabled draft: grouped pretest/posttest bar chart, labelled by question category.
if False:
# data to plot
#barIndices = matrixToDisplay.index
barIndices = scientificQuestions
matrixToDisplaySubset = matrixToDisplay.loc[scientificQuestions,:]
pretestBars = matrixToDisplaySubset.loc[barIndices, 'pretest']
posttestBars = matrixToDisplaySubset.loc[barIndices, 'posttest']
n_groups = len(barIndices)
# create plot
fig, ax = plt.subplots(figsize=(15,7))
index = np.arange(n_groups)
bar_width = 0.35
opacity = 0.8
rects1 = plt.bar(index, pretestBars, bar_width,
alpha=opacity,
color='b',
label='pretest')
rects2 = plt.bar(index + bar_width, posttestBars, bar_width,
alpha=opacity,
color='g',
label='posttest')
plt.xlabel('questions')
plt.ylabel('percentage correct')
#plt.title('Percentage correct - pretest and posttest')
#plt.xticks(index + bar_width, barIndices, rotation='vertical')
plt.xticks(index + bar_width, questionCategories, rotation='vertical')
plt.legend()
plt.tight_layout()
plt.show()
In [ ]:
# Manually curated question display order for the bar charts below.
# NOTE(review): entries marked "XXX" presumably flag questions to review — confirm.
barIndicesHardcoded = [
'Device: PCONS:GFP:RBS:TER XXX', 'Device: PBAD:GFP:RBS:TER XXX',
'Device: AMPR:RBS:PCONS:TER XXX', 'Device: GFP:RBS:PCONS:TER XXX',
'Device: RBS:PCONS:AMPR:TER XXX', 'Device: RBS:PCONS:FLHDC:TER XXX',
'Function - game: CDS', 'Function: PR', 'Function - biology: CDS',
'Example: CDS', 'Function: Plasmid', 'Function: TER', 'Function: RBS',
'Name: Operator XXX', 'Name: RBS', 'Name: CDS', 'Name: PR',
'Name: Plasmid', 'Name: TER',
'Device: PCONS:RBS:FLHDC:TER', 'Device: PBAD:RBS:ARA:TER', 'Device: PBAD:RBS:GFP:TER',
'Unequip the movement device: effect',
'BioBricks and devices composition', 'Green fluorescence',
'Ampicillin antibiotic', 'Genotype and phenotype',]
In [ ]:
# Grouped pretest/posttest bar chart using the hardcoded question order.
# NOTE(review): indentation under `if True:` and the `for` loop below was stripped
# by the notebook export; restore before running.
if True:
# data to plot
#barIndices = matrixToDisplay.index
barIndices = scientificQuestions
matrixToDisplaySubset = matrixToDisplay.loc[scientificQuestions,:]
matrixToDisplaySubset['qCategory'] = questionCategories
matrixToDisplaySubset = matrixToDisplaySubset.sort_values(by=['qCategory', 'posttest'])
barIndices = matrixToDisplaySubset.index
barIndices = barIndicesHardcoded
matrixToDisplaySubset = matrixToDisplaySubset.loc[barIndices,:]
barIndices = matrixToDisplaySubset.index
pretestBars = matrixToDisplaySubset.loc[barIndices, 'pretest']
posttestBars = matrixToDisplaySubset.loc[barIndices, 'posttest']
n_groups = len(barIndices)
# create plot
fig, ax = plt.subplots(figsize=(15,7))
index = np.arange(n_groups)
bar_width = 0.35
opacity = 0.8
rects1 = plt.bar(index, pretestBars, bar_width,
alpha=opacity,
color='b',
label='pretest')
rects2 = plt.bar(index + bar_width, posttestBars, bar_width,
alpha=opacity,
color='g',
label='posttest')
#plt.xlabel('questions')
plt.ylabel('Correct answers')
#plt.title('Percentage correct - pretest and posttest')
#plt.xticks(index + bar_width, barIndices.map(questionCategoriesDictionary), rotation='vertical')
plt.xticks(index + bar_width, barIndices, rotation='vertical')
#plt.xticks(index + bar_width, questionCategories, rotation='vertical')
plt.legend()
plt.tight_layout()
plt.show()
In [ ]:
# One horizontal pretest/posttest chart per question category, saved to disk.
# NOTE(review): plt.xlim(0,85) hardcodes the axis limit — confirm no category
# exceeds 85% correct, otherwise bars get clipped.
if True:
# data to plot
#barIndices = matrixToDisplay.index
barIndices = scientificQuestions
matrixToDisplaySubset = matrixToDisplay.loc[scientificQuestions,:]
matrixToDisplaySubset['qCategory'] = questionCategories
matrixToDisplaySubset = matrixToDisplaySubset.sort_values(by=['qCategory', 'posttest'])
barIndices = matrixToDisplaySubset.index
for qCategory in set(questionCategories):
questionsOfThatCategory = [k for k, v in questionCategoriesDictionary.items() if v == qCategory]
barIndices = questionsOfThatCategory
bars = matrixToDisplaySubset.loc[barIndices,:].sort_values(by=['posttest'], ascending=False)
barIndices = bars.index
pretestBars = bars['pretest']
posttestBars = bars['posttest']
n_groups = len(barIndices)
# create plot
fig, ax = plt.subplots(figsize=(12, n_groups*2))
plt.xlim(0,85)
index = np.arange(len(questionsOfThatCategory))
bar_width = 0.35
opacity = 0.8
rects1 = plt.barh(index + bar_width, pretestBars, bar_width,
alpha=opacity,
color='b',
label='pretest')
rects2 = plt.barh(index, posttestBars, bar_width,
alpha=opacity,
color='g',
label='posttest')
plt.xlabel('correct answers (%)')
#plt.yticks(index + bar_width, barIndices)
plt.legend()
plt.tight_layout()
plt.show()
fig.savefig("score pretest posttest h big nolabel " + qCategory)
In [ ]:
#matrixToDisplay.to_csv("../../data/sortedPrePostProgression.csv")
In [ ]:
#matrixToDisplay.T
In [ ]:
# Demographic / background cohorts for subgroup analyses.
cohortEN = gfdf[gfdf[QLanguage] == enLanguageID]
In [ ]:
#plotSamples(getTemporalitySamples(cohortEN))
In [ ]:
cohortFR = gfdf[gfdf[QLanguage] == frLanguageID]
In [ ]:
#plotSamples(getTemporalitySamples(cohortFR))
In [ ]:
cohortF = gfdf[gfdf[QGender] == 'Female']
In [ ]:
#plotSamples(getTemporalitySamples(cohortF))
In [ ]:
cohortM = gfdf[gfdf[QGender] == 'Male']
In [ ]:
#plotSamples(getTemporalitySamples(cohortM))
In [ ]:
# Biologist cohorts: S/B suffixes presumably mean strict vs broad selection
# (second argument False relaxes the criterion) — TODO confirm in the helper's source.
cohortBioS = getSurveysOfBiologists(gfdf)
In [ ]:
#plotSamples(getTemporalitySamples(cohortBioS))
In [ ]:
cohortBioB = getSurveysOfBiologists(gfdf, False)
In [ ]:
#plotSamples(getTemporalitySamples(cohortBioB))
In [ ]:
cohortGamS = getSurveysOfGamers(gfdf)
In [ ]:
#plotSamples(getTemporalitySamples(cohortGamS))
In [ ]:
cohortGamB = getSurveysOfGamers(gfdf, False)
In [ ]:
#plotSamples(getTemporalitySamples(cohortGamB))
In [ ]:
#T-tests between pretest and posttest scores among some player groups
plotBasicStats(gfdf, horizontalPlot=True, sortedAlong="progression", figsize=(20,4));
plotBasicStats(cohortF, horizontalPlot=True, sortedAlong="progression", figsize=(20,4));
plotBasicStats(cohortM, horizontalPlot=True, sortedAlong="progression", figsize=(20,4));
plotBasicStats(cohortGamB, horizontalPlot=True, sortedAlong="progression", figsize=(20,4));
In [ ]:
# Binarized (0/1 correct) answer matrix for the pretest, scientific questions only.
sciBinarizedBefore = getAllBinarized(getRMBefores(gfdf))
#sciBinarizedBefore = getAllBinarized(getGFBefores())
In [ ]:
#plotCorrelationMatrix( _binarizedMatrix, _title='Questions\' Correlations', _abs=False, _clustered=False, _questionNumbers=False ):
plotCorrelationMatrix(
sciBinarizedBefore,
_abs=False,
_clustered=False,
_questionNumbers=True,
_annot = True,
_figsize = (20,20),
_title='Correlations on survey questions before',
)
thisClustermap, overlay = plotCorrelationMatrix( sciBinarizedBefore, _abs=True, _clustered=True, _questionNumbers=True, _annot = True, _figsize = (20,20), _metric='correlation' )
In [ ]:
# Same matrix for the posttest.
sciBinarizedAfter = getAllBinarized(getRMAfters(gfdf))
In [ ]:
#plotCorrelationMatrix( _binarizedMatrix, _title='Questions\' Correlations', _abs=False, _clustered=False, _questionNumbers=False ):
plotCorrelationMatrix(
sciBinarizedAfter,
_abs=False,
_clustered=False,
_questionNumbers=True,
_annot = True,
_figsize = (20,20),
_title='Correlations on survey questions after',
)
In [ ]:
#plotCorrelationMatrix( _binarizedMatrix, _title='Questions\' Correlations', _abs=False, _clustered=False, _questionNumbers=False ):
thisClustermap, overlay = plotCorrelationMatrix(
sciBinarizedAfter,
_abs=False,
_clustered=True,
_questionNumbers=True,
_annot = True,
_figsize = (20,20),
_metric='correlation'
)
thisClustermap.ax_heatmap.annotate(overlay)
dir(thisClustermap)
dir(thisClustermap.ax_heatmap)
vars(thisClustermap)
vars(thisClustermap.ax_heatmap)
In [ ]:
# Binarized matrices over all questions (scientific + demographic), for the whole
# sample and for the pretest / posttest splits.
allQuestions = correctAnswers + demographicAnswers
allBinarized = getAllBinarized(gfdf, _source = allQuestions)
allBinarizedBefore = getAllBinarized(getRMBefores(gfdf), _source = allQuestions)
allBinarizedAfter = getAllBinarized(getRMAfters(gfdf), _source = allQuestions)
In [ ]:
#plotCorrelationMatrix( _binarizedMatrix, _title='Questions\' Correlations', _abs=False, _clustered=False, _questionNumbers=False ):
plotCorrelationMatrix(
allBinarized,
_abs=True,
_clustered=False,
_questionNumbers=True,
_annot = True,
_figsize = (20,20),
_title='Correlation of all answers',
)
thisClustermap, overlay = plotCorrelationMatrix( allBinarizedAfter, _abs=True, _clustered=True, _questionNumbers=True, _annot = True, _figsize = (20,20), _metric='correlation' )
In [ ]:
#plotCorrelationMatrix( _binarizedMatrix, _title='Questions\' Correlations', _abs=False, _clustered=False, _questionNumbers=False ):
plotCorrelationMatrix(
allBinarizedBefore,
_abs=False,
_clustered=False,
_questionNumbers=True,
_annot = True,
_figsize = (20,20),
_title='Correlations on all questions before',
)
thisClustermap, overlay = plotCorrelationMatrix( allBinarizedBefore, _abs=True, _clustered=True, _questionNumbers=True, _annot = True, _figsize = (20,20), _metric='correlation' )
In [ ]:
plotCorrelationMatrix(
allBinarizedAfter,
_abs=False,
_clustered=False,
_questionNumbers=True,
_annot = True,
_figsize = (20,20),
_title='Correlation of all answers after',
)
In [ ]:
# Posttest correlations restricted to Age vs the scientific questions.
allBinarizedAfterSub = allBinarizedAfter.copy()
allBinarizedAfterSub = allBinarizedAfterSub.loc[:,['Age'] + scientificQuestions]
In [ ]:
plotCorrelationMatrix(
allBinarizedAfterSub,
_abs=False,
_clustered=False,
_questionNumbers=True,
_annot = True,
_figsize = (20,20),
_title='Correlation of all answers after',
)
In [ ]:
In [ ]:
# Date range for the per-day plots: padded by one day on each side of the
# RedMetrics event timestamps.
#startDate = minimum152Date
#endDate = maximum152Date
startDate = rmdf['userTime'].min().date() - datetime.timedelta(days=1)
endDate = rmdf['userTime'].max().date() + datetime.timedelta(days=1)
In [ ]:
# Daily counts of all RedMetrics events.
valuesPerDay = rmdf['userTime'].map(lambda t: t.date()).value_counts().sort_index()
plotPerDay(valuesPerDay, title='RedMetrics events', startDate=startDate, endDate=endDate)
In [ ]:
# Spot-check September 2017.
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]
In [ ]:
# Daily counts of game sessions ('start' events).
valuesPerDay = rmdf[rmdf['type'] == 'start']['userTime'].map(lambda t: t.date()).value_counts().sort_index()
plotPerDay(valuesPerDay, title='sessions', startDate=startDate, endDate=endDate)
In [ ]:
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]
In [ ]:
# Daily counts of new game users (first event per userId).
valuesPerDay = rmdf.groupby('userId').agg({ "userTime": np.min })['userTime'].map(lambda t: t.date()).value_counts().sort_index()
plotPerDay(valuesPerDay, title='game users', startDate=startDate, endDate=endDate)
In [ ]:
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]
In [ ]:
# Daily counts of first survey answers per respondent.
valuesPerDay = gfdf.groupby(localplayerguidkey).agg({ QTimestamp: np.min })[QTimestamp].map(lambda t: t.date()).value_counts().sort_index()
plotPerDay(valuesPerDay, title='survey answers', startDate=startDate, endDate=endDate)
In [ ]:
valuesPerDay[pd.to_datetime('2017-09-01', utc=True).date():pd.to_datetime('2017-09-30', utc=True).date()]
In [ ]:
# Daily counts split by survey temporality (pretest / posttest / undefined).
beforesPerDay = gfdf[gfdf[QTemporality] == answerTemporalities[0]].groupby(localplayerguidkey).agg({ QTimestamp: np.min })[QTimestamp].map(lambda t: t.date()).value_counts().sort_index()
aftersPerDay = gfdf[gfdf[QTemporality] == answerTemporalities[1]].groupby(localplayerguidkey).agg({ QTimestamp: np.min })[QTimestamp].map(lambda t: t.date()).value_counts().sort_index()
undefinedPerDay = gfdf[gfdf[QTemporality] == answerTemporalities[2]].groupby(localplayerguidkey).agg({ QTimestamp: np.min })[QTimestamp].map(lambda t: t.date()).value_counts().sort_index()
plotPerDay(beforesPerDay, title='survey befores', startDate=startDate, endDate=endDate)
plotPerDay(aftersPerDay, title='survey afters', startDate=startDate, endDate=endDate)
plotPerDay(undefinedPerDay, title='survey undefined', startDate=startDate, endDate=endDate)
In [ ]:
to do: transfer part of 1.3's "'Google form analysis' functions tinkering" code here
In [ ]:
#pretests = gform[gform[QTemporality] == answerTemporalities[0]]
#pretests[pretests[QBBFunctionPlasmid] == ]
In [ ]:
# Cross-correct-answer percentages on the pretest matrix.
binarized = sciBinarizedBefore
intermediaryNumerator = getCrossCorrectAnswers(binarized).round().astype(int)*100
percentagesCrossCorrect = (intermediaryNumerator / binarized.shape[0]).round().astype(int)
# Column sums of the 0/1 matrix = number of correct answers per question.
totalPerQuestion = np.dot(np.ones(binarized.shape[0]), binarized)
# Questions that nobody answered correctly in the pretest.
sciBinarizedBefore.columns[totalPerQuestion == 0]
In [ ]:
getPercentageCrossCorrect(sciBinarizedBefore, figsize=(40,40))
In [ ]:
getPercentageCrossCorrect(sciBinarizedAfter, figsize=(40,40))
In [ ]:
# Quick size check: rows vs unique responders.
len(gfdf), len(getAllResponders(gfdf))
In [ ]:
# Compact horizontal version of the basic-stats heatmap; keeps matrixToDisplay
# (sorted along progression) for the consistency checks below.
matrixToDisplay = plotBasicStats(gfdf, horizontalPlot=True, sortedAlong="progression", figsize=(20,4));
In [ ]:
# Recompute pretest/posttest percentages directly from the binarized allData
# matrix (questions x subjects) and plot the per-question delta.
subjectCount = allData.shape[1]
measuredPretest = 100*allData.loc[pretestScientificQuestions,:].sum(axis='columns')/subjectCount
measuredPretest.index = scientificQuestions
measuredPosttest = 100*allData.loc[posttestScientificQuestions,:].sum(axis='columns')/subjectCount
measuredPosttest.index = scientificQuestions
measuredDelta2 = (measuredPosttest - measuredPretest)
measuredDelta2 = pd.DataFrame(measuredDelta2.round().astype(int))
measuredDelta2.columns = ["measuredDelta2"]
measuredDelta2 = measuredDelta2.sort_values(by = "measuredDelta2", ascending = True).T
_fig = plt.figure(figsize=(20,2))
_ax1 = plt.subplot(111)
_ax1.set_title("measuredDelta2")
# NOTE(review): vmin=0 clips any negative delta (posttest < pretest) in the
# colour scale — confirm deltas are non-negative or lower vmin.
sns.heatmap(
measuredDelta2,
ax=_ax1,
cmap=plt.cm.jet,
square=True,
annot=True,
fmt='d',
vmin=0,
vmax=100,
)
In [ ]:
#(matrixToDisplay.loc['progression',scientificQuestions] - measuredDelta2.loc['measuredDelta2',scientificQuestions])
In [ ]:
# Cross-check: the percentages measured from allData should match those
# reported by plotBasicStats (matrixToDisplay).
testDF = pd.DataFrame(columns=[
'pretest1', 'posttest1', 'measuredDelta',
'pretest2', 'posttest2', 'matrixToDisplay'], data = 0, index= scientificQuestions)
testDF['pretest1'] = measuredPretest
testDF['posttest1'] = measuredPosttest
testDF['measuredDelta'] = measuredDelta2.T['measuredDelta2']
testDF['pretest2'] = matrixToDisplay.T['pretest'][scientificQuestions]
testDF['posttest2'] = matrixToDisplay.T['posttest'][scientificQuestions]
testDF['matrixToDisplay'] = matrixToDisplay.T['progression'][scientificQuestions]
testDF = testDF.round().astype(int)
#testDF
In [ ]:
# Per-subject total score delta across the scientific questions.
measuredDelta = allData.loc[deltaScientificQuestions,:].sum(axis='columns')
measuredDelta.mean(), measuredDelta.median()
#measuredDelta.sort_values()
In [ ]:
In [ ]:
#pretestData = getAllUserVectorData( gfdf[gfdf[QTemporality] == answerTemporalities[0]], _source = correctAnswers )
#posttestData = getAllUserVectorData( gfdf[gfdf[QTemporality] == answerTemporalities[1]], _source = correctAnswers )
In [ ]:
# Full correlation matrix over all per-user variables (users as rows after .T).
plotAllUserVectorDataCorrelationMatrix(
allData.T,
_abs=False,
_figsize = (40,40),
_clustered=False
)
In [ ]:
# Restricted correlation matrix: times vs pretest answers vs overall scores.
# NOTE(review): completionTimesCriteria / totalTimesCriteria / overallScoreCriteria
# are defined further down in this notebook — on a fresh Restart & Run All this
# cell fails; move those definition cells above this one.
demographicCriteria = demographicQuestions.copy()
plotAllUserVectorDataCorrelationMatrix(
allData.T,
_abs=False,
_figsize = (20,20),
_clustered=False,
columnSubset=[]\
+ completionTimesCriteria
+ totalTimesCriteria
+ pretestScientificQuestions
#+ posttestScientificQuestions
#+ deltaScientificQuestions
+ overallScoreCriteria
#+ demographicCriteria
)
In [ ]:
#completers = rmdf[rmdf['type'] == 'complete'][QUserId]
#nonCompleter = rmdf[~rmdf[QUserId].isin(completers)][QUserId].iloc[0]
In [ ]:
#getUserDataVector(nonCompleter)#.loc[14,:]
In [ ]:
#allData.shape
In [ ]:
#allData.index
In [ ]:
# Assemble one row per user: played time, scores and completion flag.
# NOTE(review): "pretestScore" is not in the declared columns list but is added
# on the fly below; also, filling row-by-row with .loc is slow but acceptable at
# this sample size. Export stripped the loop-body indentation.
data = pd.DataFrame(index=allData.columns, columns=["time", "posttestScore", "deltaScore","completed"])
for userId in data.index:
data.loc[userId, "time"] = getPlayedTimeUser(userId, _rmDF = rmdf)['tutorial']['totalSpentTime'].total_seconds()
data.loc[userId, "posttestScore"] = allData.loc['scoreposttest', userId]
data.loc[userId, "pretestScore"] = allData.loc['scorepretest', userId]
data.loc[userId, "deltaScore"] = allData.loc['scoredelta', userId]
data.loc[userId, "completed"] = allData.loc['complete', userId]
data.shape
# NOTE(review): dead/garbled cell — four statements were collapsed onto one line
# (a SyntaxError), and the names they reference (allScores, completedScores,
# allPlayedTimes, completedPlayedTimes) are defined nowhere in this notebook.
# Kept commented out for reference; the next cell builds x/x2/y/y2 from `data`.
#x = allScores.copy()
#x2 = completedScores.copy()
#y = allPlayedTimes.copy()
#y2 = completedPlayedTimes.copy()
In [ ]:
# Scatter plots of played time vs posttest score; completed games overplotted
# in a second colour.
x = data["posttestScore"]
x2 = data[data["completed"]==1]["posttestScore"]
y = data["time"]
y2 = data[data["completed"]==1]["time"]
plt.figure(figsize=(12, 4))
ax1 = plt.subplot(121)
plt.scatter(x, y)#, c='blue', alpha=0.5)
plt.scatter(x2, y2)#, c='red', alpha=0.5)
plt.xlabel('score')
plt.ylabel('time')
plt.title("time against score, n=" + str(len(x)))
#ax1.legend(loc='center left', bbox_to_anchor=(1, 0.5))
ax2 = plt.subplot(122)
plt.scatter(y, x)
plt.scatter(y2, x2)
plt.xlabel('time')
plt.ylabel('score')
plt.title("score against time, n=" + str(len(x)))
ax2.legend(loc='center left', bbox_to_anchor=(-1.2, 0.9), labels =["unfinished games","completed games"])
plt.show()
In [ ]:
x = data["posttestScore"].astype(float)
x2 = data[data["completed"]==1]["posttestScore"].astype(float)
y = data["time"].astype(float)
y2 = data[data["completed"]==1]["time"].astype(float)
# Get the linear models
lm_original = np.polyfit(x, y, 1)
# calculate the y values based on the co-efficients from the model
r_x, r_y = zip(*((i, i*lm_original[0] + lm_original[1]) for i in x))
# Put in to a data frame, to keep is all nice
lm_original_plot = pd.DataFrame({
'scores' : r_x,
'times' : r_y
})
lm_original_plot = lm_original_plot.drop_duplicates()
lm_original_plot = lm_original_plot.sort_values(by="scores")
lm_original_plot = lm_original_plot.drop(lm_original_plot.index[1:-1])
In [ ]:
# Scatter plot with the fitted regression line from the previous cell.
plt.figure(figsize=(6, 4))
ax = plt.subplot(111)
plt.scatter(x, y)
plt.scatter(x2, y2)
# Plot the original data and model
#lm_original_plot.plot(kind='line', color='Red', x='scores', y='times', ax=ax)
plt.plot('scores', 'times', data=lm_original_plot, color='Red')
plt.xlabel('score')
plt.ylabel('time')
plt.show()
In [ ]:
# Same regression redone with scikit-learn, reporting MSE and R².
# NOTE(review): imports mid-notebook — move to a top imports cell for
# Restart & Run All hygiene.
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
x = data["posttestScore"].astype(float)
x2 = data[data["completed"]==1]["posttestScore"].astype(float)
y = data["time"].astype(float)
y2 = data[data["completed"]==1]["time"].astype(float)
# sklearn expects a 2-D feature matrix.
xReshaped = x.values.reshape(-1, 1)
# Create linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(xReshaped, y)
# Make predictions using the testing set
pred = regr.predict(xReshaped)
# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f"
% mean_squared_error(y, pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y, pred))
# Plot outputs
plt.scatter(x, y, color='black')
plt.plot(x, pred, color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()
In [ ]:
sns.regplot(x=x, y=y, color="b")
plt.scatter(x2, y2, color='red')
plt.xlabel("score")
plt.ylabel("time played")
data = pd.DataFrame(index = range(0, len(xReshaped)), data = xReshaped, columns = ['score'])
data['time'] = y.values
data
data2 = data.loc[:, ["time", "posttestScore"]] data2.index = range(0, data.shape[0])
data2
In [ ]:
# NOTE(review): import mid-notebook — move to a top imports cell.
#import patsy
import statsmodels.formula.api as smf
In [ ]:
# NOTE(review): `data` was reassigned above (columns 'score'/'time' only), so
# data2 lacks posttestScore/completed here — the OLS cells below only work
# against the original per-user frame; verify cell execution order.
data2 = data.astype(float)
In [ ]:
### STATSMODELS ###
# OLS: played time explained by posttest score.
timeScoreformula = 'time ~ posttestScore'
# create a fitted model
lm1 = smf.ols(formula=timeScoreformula, data=data2).fit()
# print the coefficients
#lm1.params
#lm1.summary()
In [ ]:
# print the confidence intervals for the model coefficients
lm1.conf_int()
In [ ]:
# print the p-values for the model coefficients
# Represents the probability that the coefficient is actually zero
lm1.pvalues
In [ ]:
# print the R-squared value for the model
lm1.rsquared
In [ ]:
### STATSMODELS ###
# Compare fit quality on all games vs unfinished vs completed games.
timeScoreformula = 'time ~ posttestScore'
lm1 = smf.ols(formula=timeScoreformula, data=data2).fit()
lm2 = smf.ols(formula=timeScoreformula, data=data2[data2["completed"] == 0]).fit()
lm3 = smf.ols(formula=timeScoreformula, data=data2[data2["completed"] == 1]).fit()
lm1.rsquared,lm2.rsquared,lm3.rsquared
In [ ]:
# Relative score gain: mean delta over mean pretest score.
data['deltaScoreRate'] = data['deltaScore']/data['pretestScore']
meanDelta = data['deltaScore'].mean()
meanPretest = data['pretestScore'].mean()
meanDelta/meanPretest
In [ ]:
# Labels of the aggregate score rows in allData.
# NOTE(review): these three criteria lists are consumed by cells much earlier in
# the notebook — move these definition cells above first use for Restart & Run All.
overallScoreCriteria = ["scorepretest", "scoreposttest", "scoredelta",]
In [ ]:
# Chapter time criteria: ch00..ch14, plus the overall completion/total times.
stemTimesCriteria = ["ch" + "{0:0=2d}".format(i) for i in range(0,15)]
completionTimesCriteria = [st + "completion" for st in stemTimesCriteria] + ["completionTime"]
totalTimesCriteria = [st + "total" for st in stemTimesCriteria] + ["totalTime"]
In [ ]:
# Users with a real (non-sentinel) chapter-0 completion time; pd.Timedelta.max
# seconds is used as the "never completed" sentinel.
allData2 = allData.T.rename(str,axis="columns")
allData3 = allData2[allData2['ch00completion'] < pd.Timedelta.max.total_seconds()]
len(allData3)
In [ ]:
allData2[allData2[criterionLabel]>9e+09]
In [ ]:
sectionNb = '01'
criterionLabel = 'ch' + sectionNb + 'completion'
sectionName = 'tutorial.Checkpoint' + sectionNb
testUserId = allData2[allData2[criterionLabel]>9e+09].index[0]
#rmdf or rmdfConcat
_rmdf = rmdfConcat
_rmdf[(_rmdf[QUserId] == testUserId) \
& (_rmdf['type'] == 'reach') \
& (_rmdf['section'] == 'tutorial.Checkpoint' + sectionNb) \
].loc[:, ['section', 'userTime']]
In [ ]:
testUserId
In [ ]:
_rmdf[(_rmdf[QUserId] == testUserId)]
In [ ]:
gfdf[gfdf[QUserId] == testUserId]
In [ ]:
# Correlations between chapter times and posttest answers / overall scores.
# NOTE(review): the export stripped the for-loop indentation below; restore
# before running.
#chosenPrefix = answerTemporalities[0]
chosenPrefix = answerTemporalities[1]
#chosenPrefix = "delta"
#warning: not the same as displayed columns, see lower
chosenCriteria = [chosenPrefix + " " + q for q in scientificQuestions] + overallScoreCriteria
durationsScoresCorrelations = pd.DataFrame(index=completionTimesCriteria+totalTimesCriteria, columns=chosenCriteria, data=np.nan)
durationsScoresCorrelations = durationsScoresCorrelations.rename(str, axis='rows')
# Per-row sample sizes (users with a real, non-sentinel time), shown beside the heatmap.
annotationMatrix = np.empty(shape=[durationsScoresCorrelations.shape[0], 1], dtype=int)
#annotationMatrix2D = np.empty(durationsScoresCorrelations.shape, dtype=str)
allData2 = allData.T.rename(str,axis="columns")
for i in range(len(durationsScoresCorrelations.index)):
checkpoint = durationsScoresCorrelations.index[i]
allData3 = allData2[allData2[checkpoint] < pd.Timedelta.max.total_seconds()]
annotationMatrix[i] = len(allData3)
for q in durationsScoresCorrelations.columns:
corr = np.corrcoef(allData3[checkpoint], allData3[q])
# NOTE(review): corr[0,0] is the self-correlation (1.0 unless degenerate/NaN);
# this check presumably flags degenerate columns — confirm intent.
if corr[0,0] < 0:
print("[" + checkpoint + ";" + q + "]:" + str(corr[0,0]))
#if pd.isnull(corr[0,1]):
# print("[" + checkpoint + ";" + q + "] null")
durationsScoresCorrelations.loc[checkpoint, q] = corr[0,1]
_fig, (_a0, _a1) = plt.subplots(1,2, gridspec_kw = {'width_ratios':[50, 1]}, figsize=(15,10))
#_a0.set_title("correlations between times and " + chosenPrefix + " scores")
_a0.set_title("correlations between times and scores")
durationsScoresCorrelations.columns = [q for q in scientificQuestions] + ["pretest score", "posttest score", "score increase",]
sns.heatmap(durationsScoresCorrelations, ax=_a0, cmap=plt.cm.jet, square=True, vmin=-1, vmax=1,
# annot=True,
# annot=annotationMatrix2D
#cbar_kws= {'panchor':(0.0, 0.0)}
)
_a1.set_title("")
sns.heatmap(annotationMatrix, ax=_a1, annot=annotationMatrix)
_fig.tight_layout()
In [ ]:
# Correlations between overall scores and pretest demographic/background answers.
# The commented-out question lists and criteria alternatives below are kept as a
# menu of previously explored configurations.
# NOTE(review): the export stripped the for-loop indentation; restore before running.
#chosenPrefix = answerTemporalities[0]
#chosenPrefix = answerTemporalities[1]
#chosenPrefix = "delta"
#warning: not the same as displayed columns, see lower
#questions1 = [QAge,QGender]
#questions1 = [QEnjoyed]
#questions2 = [
# QCuriosityBiology,QCuriositySyntheticBiology,QCuriosityVideoGames,
# QCuriosityEngineering,
## QPlayed,
# QAge,QGender,
# QInterestVideoGames,
# QInterestBiology,QStudiedBiology,QPlayVideoGames,
## QHeardSynBioOrBioBricks,
## QVolunteer,
# QEnjoyed]
#questions2 = [
# QCuriosityBiology,
# QCuriositySyntheticBiology,
# QCuriosityVideoGames,
# QCuriosityEngineering,
# QPlayed,
#QAge,
#QGender,
# QInterestVideoGames,
# QInterestBiology,
# QStudiedBiology,
# QPlayVideoGames,
# QHeardSynBioOrBioBricks,
# QVolunteer,
# QEnjoyed #use only posttest value
# ]
questions2 = [
QCuriosityEngineering,
QCuriosityBiology,
QCuriositySyntheticBiology,
QInterestBiology,
QStudiedBiology,
QCuriosityVideoGames,
QInterestVideoGames,
QPlayVideoGames,
# QPlayed,
QAge,
QGender,
# QHeardSynBioOrBioBricks,
# QVolunteer,
# QEnjoyed #use only posttest value
]
#chosenCriteria1 = completionTimesCriteria+totalTimesCriteria
#chosenCriteria1 = ["posttest " + q for q in scientificQuestions] + overallScoreCriteria
chosenCriteria1 = overallScoreCriteria
#chosenCriteria1 = ["pretest " + q for q in questions1]
#chosenCriteria1 = ["posttest " + q for q in questions1]
#chosenCriteria2 = ["posttest " + q for q in questions2]
#chosenCriteria2 = ["pretest " + q for q in questions2] + ["posttest " + QEnjoyed]
#chosenCriteria2 = ["posttest " + q for q in scientificQuestions] + overallScoreCriteria
chosenCriteria2 = ["pretest " + q for q in questions2]
#chosenCriteria2 = ["maxChapter"]
criteriaScoresCorrelations = pd.DataFrame(index=chosenCriteria1, columns=chosenCriteria2, data=np.nan)
criteriaScoresCorrelations = criteriaScoresCorrelations.rename(str, axis='rows')
# Per-row sample sizes (only meaningful when criterion 1 is a time criterion).
annotationMatrix = np.empty(shape=[criteriaScoresCorrelations.shape[0], 1], dtype=int)
#annotationMatrix2D = np.empty(durationsScoresCorrelations.shape, dtype=str)
#allData2 = allData.T.rename(str,axis="columns")
#allData2 = allBinaryDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers
allData2 = allNumericDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers
allData2 = allData2.T.rename(str,axis="columns")
for i in range(len(criteriaScoresCorrelations.index)):
criterion1i = criteriaScoresCorrelations.index[i]
allData3 = allData2
# Drop users with the pd.Timedelta.max "never completed" sentinel for time criteria.
if criterion1i in completionTimesCriteria:
allData3 = allData2[allData2[criterion1i] < pd.Timedelta.max.total_seconds()]
annotationMatrix[i] = len(allData3)
for criterion2j in criteriaScoresCorrelations.columns:
corr = np.corrcoef(allData3[criterion1i], allData3[criterion2j])
if corr[0,0] < 0:
print("[" + criterion1i + ";" + criterion2j + "]:" + str(corr[0,0]))
#if pd.isnull(corr[0,1]):
# print("[" + criterion1i + ";" + criterion2j + "] null")
criteriaScoresCorrelations.loc[criterion1i, criterion2j] = corr[0,1]
# Relabel axes for display (must match the chosenCriteria actually used above).
#index 1
#criteriaScoresCorrelations.index = scientificQuestions + ["pretest score", "posttest score", "score increase"]
#criteriaScoresCorrelations.index = questions1
criteriaScoresCorrelations.index = ["pretest score", "posttest score", "score increase"]
#columns 2
criteriaScoresCorrelations.columns = questions2
#criteriaScoresCorrelations.columns = questions2 + [QEnjoyed]
#criteriaScoresCorrelations.columns = scientificQuestions + ["pretest score", "posttest score", "score increase"]
#criteriaScoresCorrelations.columns = ["max. checkpoint"]
# (10,20) big
# (12,5) small
#_fig, (_a0) = plt.subplots(1,1, figsize=(10,18))
_fig, (_a0) = plt.subplots(1,1, figsize=(6,10))
#_fig, (_a0, _a1) = plt.subplots(
# 1,2, figsize=(5,25), gridspec_kw = {'width_ratios':[15, 1]})
# 2,1, figsize=(17,12), gridspec_kw = {'height_ratios':[30, 1]})
#sns.set(font_scale=1)
#sns.set(font_scale=1.3)
sns.set(font_scale=1.7)
data = criteriaScoresCorrelations.T
#_a0.set_title("correlations between times and demographic criteria")
#_a0.set_title("correlations between scores and demographic criteria")
#_a0.set_title("correlations between (age, gender) and (curiosity, interest, practice, enjoyment)")
#_a0.set_title("correlations between enjoyment and age, gender, curiosity, interest, practice, enjoyment")
#plt.title("correlations between enjoyment and age, gender, curiosity, interest, practice")
#_a0.set_title("correlations between times and scores")
#_a0.set_title("correlations between scores and maximum checkpoint reached")
_a0.set_anchor('C')
sns.heatmap(data, ax=_a0,
#cmap=plt.cm.jet,
cmap="RdBu_r",
square=True, vmin=-1, vmax=1,
annot=True,
# cbar = False,
# annot=annotationMatrix2D
#cbar_kws= {'panchor':(0.0, 0.0)}
#cbar_kws = dict(use_gridspec=False,location="right"),
annot_kws={"size": 13},
#annot_kws={"size": 13},
)
#_a1.set_anchor('C')
#data = annotationMatrix.T
#sns.heatmap(data, ax=_a1, annot=data, square=True,
# cbar = False,xticklabels=False,yticklabels=False,annot_kws={"size": 12})
_fig.tight_layout()
In [ ]:
In [ ]:
# Standalone heatmap of the per-criterion sample sizes (annotationMatrix)
# collected while building the correlation matrix above.
# NOTE(review): the x tick labels assume annotationMatrix was built over
# completionTimesCriteria + totalTimesCriteria rows — confirm this cell is in
# sync with the criteria actually used above.
_fig, _a1 = plt.subplots(1, 1, figsize=(10, 5))
_a1.set_anchor('C')
data = annotationMatrix.T
sns.heatmap(
    data,
    ax=_a1,
    annot=data,
    square=True,
    cbar=False,
    xticklabels=completionTimesCriteria + totalTimesCriteria,
    yticklabels=False,
    annot_kws={"size": 12},
)
_fig.tight_layout()
In [ ]:
# Sample size for one duration criterion after excluding the
# pd.Timedelta.max "never completed" sentinel values.
i = 0
# NOTE(review): durationsScoresCorrelations is not defined in the current code
# path of this notebook (only criteriaScoresCorrelations is; see the commented
# line near the top of the correlation cell) — presumably a leftover from an
# earlier variant. Confirm before running this cell.
checkpoint = durationsScoresCorrelations.index[i]
print(checkpoint + ": " + str(len(allData2[allData2[checkpoint] < pd.Timedelta.max.total_seconds()])))
In [ ]:
# Spot-check: total times per checkpoint for one arbitrary survey respondent.
testUserId = gfdf[QUserId].unique()[12]
getCheckpointsTotalTimesUser(testUserId, rmdf)
In [ ]:
#timedSectionnedEvents.to_csv("ch4.csv", encoding=csvEncoding)
In [ ]:
#getAllResponders(gfdf), _source = correctAnswers, _rmDF = rmdf
#testUserId = "4731525f-62dd-4128-ab56-3991b403e17e"
#getUserDataVector(testUserId,_source = correctAnswers, _rmDF = rmdf)
In [ ]:
# delta or posttest?
# posttest: values 0, 1 managed in plotCorrectedAnswerPerMaxCheckpoint
# delta can't work: values 0, 1 and -1 not managed in plotCorrectedAnswerPerMaxCheckpoint
chosenPrefix = "posttest"
chosenQuestions = [chosenPrefix + " " + q for q in scientificQuestions]
criteria = ["maxChapter","complete"] + chosenQuestions + overallScoreCriteria
#data = allBinaryDataPlaytestPhase1PretestPosttestUniqueProfiles.loc[criteria,:]
data = allBinaryDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.loc[criteria,:]
#data = allBinaryDataPlaytestPhase2PretestPosttestUniqueProfiles.loc[criteria,:]
#data = allBinaryDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers.loc[criteria,:]
#data = allNumericDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.loc[criteria,:]
In [ ]:
# Two example questions used by the per-question plots below; xIndex is the
# column (a global read by plotCorrectedAnswerPerMaxCheckpoint) giving the
# max checkpoint reached.
criterion1 = chosenPrefix + ' Function - biology: CDS'
criterion2 = chosenPrefix + ' Device: RBS:PCONS:FLHDC:TER XXX'
xIndex = 'maxChapter'
dataT = data.T
In [ ]:
def getCheckpointThreshold(bars0, bars1, thresholdRatio = .9):
    """Find the checkpoint above which the correct-answer ratio stays high enough.

    Accumulates incorrect (bars0) and correct (bars1) answer counts from the
    last checkpoint backwards. Returns N - i (N = number of checkpoints) for
    the first reversed index i at which the cumulative correct count falls
    below floor(thresholdRatio * cumulative total), or 0 if that never
    happens (the "no threshold found" sentinel, cf. getMostFrequentThreshold).

    bars0, bars1 -- per-checkpoint counts, one entry per checkpoint.
    thresholdRatio -- required fraction of correct answers (default 0.9).
    """
    cumulative0 = np.cumsum(list(reversed(bars0)))
    cumulative1 = np.cumsum(list(reversed(bars1)))
    result = 0
    #np.argmax(cumulative1>=thresholdCount)
    for i in range(len(cumulative1)):
        thresholdCount = np.floor(thresholdRatio * (cumulative0[i] + cumulative1[i]))
        if cumulative1[i] < thresholdCount:
            # Generalized from the previous hard-coded "15 - i" so bar lists
            # of any length work (identical for the usual 15 checkpoints).
            result = len(cumulative1) - i
            break
    return result
In [ ]:
from matplotlib import rc
import pandas as pd
# stacked horizontal bar plot; cf df.plot.barh?
def plotCorrectedAnswerPerMaxCheckpoint(dataT, criterion, saveFig=False, plotFig=True, thresholdRatio=0.9):
    """Stacked bar plot of incorrect/correct answers to one question per max checkpoint.

    dataT -- one row per user; binary answer columns plus the column named by
        the global xIndex ('maxChapter') giving the max checkpoint reached.
    criterion -- column of dataT holding the 0/1 corrected answer.
    saveFig -- if True, save the figure under a sanitized file name.
    plotFig -- if False, skip all plotting and only compute the counts.
    thresholdRatio -- forwarded to getCheckpointThreshold.

    Returns [bars0, bars1, threshold]: per-checkpoint (0..14) counts of
    incorrect and correct answers, and the threshold checkpoint.
    """
    # y-axis in bold
    rc('font', weight='bold')
    # Counts of each group, per checkpoint value 0..14.
    bars0 = [len(dataT[(dataT[criterion]==0) & (dataT[xIndex]==maxChapterValue)]) for maxChapterValue in range(15)]
    bars1 = [len(dataT[(dataT[criterion]==1) & (dataT[xIndex]==maxChapterValue)]) for maxChapterValue in range(15)]
    # (The previous total-heights list `bars` was computed but never used.)
    if plotFig:
        # The position of the bars on the x-axis, their labels, and bar width.
        r = [i for i in range(15)]
        names = [i for i in range(15)]
        barWidth = 1
        fig, ax = plt.subplots(1, 1, figsize=(10, 6))
        # Red bars: incorrect answers.
        ax.bar(r, bars0, color='#cc0c28', edgecolor='white', width=barWidth)
        # Green bars: correct answers, stacked on top of the first ones.
        ax.bar(r, bars1, bottom=bars0, color='#557f2d', edgecolor='white', width=barWidth)
        # Custom X axis.
        plt.xticks(r, names, fontweight='bold')
        plt.xlabel("max. checkpoint")
        plt.ylabel("count")
        plt.title("Answers to question '" + criterion + "' against max. checkpoint, n=" + str(len(dataT.index)))
        ax.legend(["incorrect", "correct"],
                  bbox_to_anchor=(0.7, 0.7),
                  # loc="upper center",
                  )
        # Show graphic
        plt.show()
        if saveFig:
            #correctedAnswersPerMaxCheckpoint
            questionTitle = "cAPMC-'" + criterion.replace(" ", "_").replace(":", "") + "'"
            try:
                fig.savefig(questionTitle)
            except Exception:
                # BUGFIX: was a bare `except:`, which also swallowed
                # KeyboardInterrupt/SystemExit; keep best-effort behavior but
                # only for ordinary errors.
                print("- savefig failed for " + questionTitle)
    return [bars0, bars1, getCheckpointThreshold(bars0, bars1, thresholdRatio)]
# Example run for one question, with the plot shown.
[bars0, bars1, threshold] = plotCorrectedAnswerPerMaxCheckpoint(dataT, criterion2, saveFig=False, plotFig=True)
threshold
In [ ]:
# Stricter ratio: require 100% correct answers.
getCheckpointThreshold(bars0, bars1, thresholdRatio = 1)
In [ ]:
np.cumsum(bars1)
In [ ]:
# Threshold checkpoint for every posttest question (no plots), ratio 0.8.
# 15 is the "not computed" default, overwritten for each question below.
thresholdsCheckpoints = pd.Series(index = chosenQuestions, data = 15, name = "thresholdsCheckpoints")
for criterion in chosenQuestions:
    [bars0, bars1, threshold] = plotCorrectedAnswerPerMaxCheckpoint(
        dataT,
        criterion,
        saveFig=False,
        plotFig=False,
        thresholdRatio=0.8
    )
    thresholdsCheckpoints[criterion] = threshold
thresholdsCheckpoints
In [ ]:
thresholdsCheckpoints
In [ ]:
def plotCheckpointsFromThreshold(dataT, criterion, saveFig=False):
    """Plot, for one question, the threshold checkpoint as a function of thresholdRatio.

    Sweeps thresholdRatio over [0.5, 1.0] in 11 steps, computes the threshold
    checkpoint via plotCorrectedAnswerPerMaxCheckpoint (computation only, no
    bar plot), and plots checkpoint against ratio.

    Returns ys, the list of threshold checkpoints (one per sampled ratio).
    """
    xs = []
    ys = []
    for x in np.linspace(0.5, 1, 11):
        [bars0, bars1, thresholdCheckpoint] = plotCorrectedAnswerPerMaxCheckpoint(
            dataT,
            criterion,
            saveFig=False,
            plotFig=False,
            thresholdRatio=x
        )
        xs += [x]
        ys += [thresholdCheckpoint]
        #print("x=" + str(x) +": " + str(thresholdCheckpoint))
    fig = plt.figure(figsize=(12, 4))
    ax1 = plt.subplot(111)
    plt.plot(xs, ys)
    # Checkpoints range over 0..14; pad half a unit on each side.
    plt.ylim((-0.5, 14.5))
    plt.xlabel('threshold')
    plt.ylabel('checkpoint')
    plt.title("Checkpoint against threshold, for question '" + criterion + "'")
    plt.show()
    if saveFig:
        #correctedAnswersPerMaxCheckpoint
        questionTitle = "cFT-'" + criterion.replace(" ", "_").replace(":", "") + "'"
        try:
            fig.savefig(questionTitle)
        except Exception:
            # BUGFIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.
            print("- savefig failed for " + questionTitle)
    return ys
ys = plotCheckpointsFromThreshold(dataT, criterion2)
In [ ]:
def getMostFrequentThreshold(ys):
    """Most common non-trivial threshold in ys, as a (value, count) pair.

    0 and 15 are sentinel values ("no threshold found" / default) and are
    ignored unless ys contains nothing else, in which case the most common
    value of the raw list is returned.
    """
    # Local import: the notebook only imports Counter in a later cell, so the
    # module-level name would be missing if cells run out of order.
    from collections import Counter
    # Idiom fix: boolean `and` instead of bitwise `&` on comparison results
    # (identical outcome for these booleans, clearer intent).
    result = [x for x in ys if x != 15 and x != 0]
    if len(result) == 0:
        return Counter(ys).most_common(1)[0]
    else:
        return Counter(result).most_common(1)[0]
In [ ]:
from collections import Counter
# For each question: most frequent (threshold checkpoint, count) across the
# ratio sweep. NOTE: getMostFrequentThreshold(ys) is called twice per
# question; caching its result would avoid recomputing the Counter.
thresholdsCheckpoints2 = pd.DataFrame(index = chosenQuestions, columns = ['threshold', 'count'], data = 15)
for criterion in chosenQuestions:
    ys = plotCheckpointsFromThreshold(dataT, criterion, saveFig=False)
    thresholdsCheckpoints2.loc[criterion, 'threshold'] = getMostFrequentThreshold(ys)[0]
    thresholdsCheckpoints2.loc[criterion, 'count'] = getMostFrequentThreshold(ys)[1]
In [ ]:
thresholdsCheckpoints2
In [ ]:
# Scatter of posttest score against max checkpoint reached.
#for criterion in criteria:
criterion = 'scoreposttest'
x = data.loc["maxChapter",:].values
y = data.loc[criterion,:].values
plt.figure(figsize=(6, 6))
ax1 = plt.subplot(111)
plt.scatter(x, y)#, c='blue', alpha=0.5)
plt.xlabel('max. checkpoint')
plt.ylabel("posttest score")
plt.title("Posttest score against max. checkpoint, n=" + str(len(x)))
plt.show()
In [ ]:
# Same relationship with per-checkpoint means and a linear regression fit.
sns.regplot(x=x, y=y, color="b", x_estimator=np.mean)
plt.xlabel("max. checkpoint")
plt.ylabel("posttest score")
plt.title("Posttest score against max. checkpoint, n=" + str(len(x)))
In [ ]:
#import patsy
import statsmodels.formula.api as smf
In [ ]:
# statsmodels' formula interface needs numeric dtypes.
dataT = data.T.astype(float)
In [ ]:
### STATSMODELS ###
# Ordinary least squares of the chosen score criterion on maxChapter.
scoreCheckpointformula = criterion + ' ~ maxChapter'
# create a fitted model
lm1 = smf.ols(formula=scoreCheckpointformula, data=dataT).fit()
# print the coefficients
#lm1.params
#lm1.summary()
In [ ]:
# print the confidence intervals for the model coefficients
lm1.conf_int()
In [ ]:
# print the p-values for the model coefficients:
# the probability of observing an effect at least this large if the true
# coefficient were zero (NOT the probability that the coefficient is zero)
lm1.pvalues
In [ ]:
# print the R-squared value for the model
lm1.rsquared
In [ ]:
from scipy import optimize
# Commented-out sample arrays, presumably from the recipe this segmented
# regression is based on; x and y come from the scatter cell above.
#x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13, 14, 15], dtype=float)
#y = np.array([5, 7, 9, 11, 13, 15, 28.92, 42.81, 56.7, 70.59, 84.47, 98.36, 112.25, 126.14, 140.03])
def piecewise_linear(x, x0, y0, k1, k2):
    """Two-segment linear model hinged at (x0, y0): slope k1 for x < x0, k2 otherwise."""
    below = lambda v: y0 + k1 * (v - x0)
    above = lambda v: y0 + k2 * (v - x0)
    return np.piecewise(x, [x < x0], [below, above])
# Fit the two-segment model to posttest score vs max checkpoint; p holds the
# fitted (x0, y0, k1, k2), e the covariance matrix.
p , e = optimize.curve_fit(piecewise_linear, x, y)
xd = np.linspace(0, 14, 100)
plt.plot(x, y, "o")
plt.plot(xd, piecewise_linear(xd, *p))
plt.xlabel("max. checkpoint")
plt.ylabel("posttest score")
# piecewise regression
plt.title("Posttest score against max. checkpoint, segmented regression")
In [ ]:
#players = rmdf.loc[:, playerFilteringColumns]
# Normalized RedMetrics event table, one row per event.
players = safeGetNormalizedRedMetricsCSV( rmdf )
players.shape
In [ ]:
#players = players.dropna(how='any')
#players.head(1)
#rmdf.head(1)
In [ ]:
players.shape[0]
In [ ]:
#players = players[~players['userId'].isin(excludedIDs)];
#players.shape[0]
In [ ]:
# Number of distinct game sessions.
sessionscount = players["sessionId"].nunique()
sessionscount
In [ ]:
In [ ]:
# Number of distinct players.
uniqueplayers = players['userId']
uniqueplayers = uniqueplayers.unique()
uniqueplayers.shape[0]
In [ ]:
#uniqueplayers
In [ ]:
uniqueplatforms = players['customData.platform'].unique()
uniqueplatforms
In [ ]:
# Best checkpoint per session: 'reach' events on 'tutorial*' sections.
checkpoints = rmdf.loc[:, ['type', 'section', 'sessionId']]
checkpoints = checkpoints[checkpoints['type']=='reach'].loc[:,['section','sessionId']]
checkpoints = checkpoints[checkpoints['section'].str.startswith('tutorial', na=False)]
checkpoints = checkpoints.groupby("sessionId")
# NOTE(review): max() compares section names lexicographically — presumably
# fine for the naming scheme used, but verify for names like 'tutorial10'
# vs 'tutorial2'.
checkpoints = checkpoints.max()
#len(checkpoints)
checkpoints.head()
In [ ]:
# Distribution of best checkpoints across sessions; the 'Start' placeholder
# row is filled with the never-reached count further down.
maxCheckpointTable = pd.DataFrame({"maxCheckpoint" : checkpoints.values.flatten()})
maxCheckpointCounts = maxCheckpointTable["maxCheckpoint"].value_counts()
maxCheckpointCounts['Start'] = None
maxCheckpointCounts = maxCheckpointCounts.sort_index()
print('\nmaxCheckpointCounts=\n{0}'.format(str(maxCheckpointCounts)))
In [ ]:
maxCheckpointCountsTable = pd.DataFrame({"maxCheckpoint" : maxCheckpointCounts.values})
# Total sessions that reached at least one checkpoint.
maxCheckpointCountsTableCount = maxCheckpointCountsTable.sum(0)[0]
maxCheckpointCountsTableCount
In [ ]:
checkpoints.count()
In [ ]:
maxCheckpointCountsTable.head()
In [ ]:
maxCheckpointCountsTable.describe()
In [ ]:
genericTreatment( maxCheckpointCountsTable, "best checkpoint reached", "game sessions", 0, maxCheckpointCountsTableCount, False, True )
In [ ]:
#starts = rmdf.loc[:, checkpointsRelevantColumns]
#starts = checkpoints[checkpoints['type']=='start'].loc[:,['playerId']]
#starts = checkpoints[checkpoints['section'].str.startswith('tutorial', na=False)]
#starts = checkpoints.groupby("playerId")
#starts = checkpoints.max()
#starts.head()
In [ ]:
# Add sessions that started but never reached any checkpoint as 'Start'.
startTutorial1Count = sessionscount
neverReachedGameSessionCount = startTutorial1Count - maxCheckpointCountsTableCount
fullMaxCheckpointCounts = maxCheckpointCounts
fullMaxCheckpointCounts['Start'] = neverReachedGameSessionCount
fullMaxCheckpointCountsTable = pd.DataFrame({"fullMaxCheckpoint" : fullMaxCheckpointCounts.values})
genericTreatment( fullMaxCheckpointCountsTable, "best checkpoint reached", "game sessions", 0, startTutorial1Count, False, True )
print('\nfullMaxCheckpointCountsTable=\n{0}'.format(fullMaxCheckpointCountsTable))
fullMaxCheckpointCountsTable.describe()
Duration of playing sessions
In [ ]:
# Session duration = span between first and last event server times.
# NOTE(review): passing np.min/np.max to agg produces 'amin'/'amax' column
# names, which the next line relies on; newer pandas deprecates this in favor
# of "min"/"max" strings, which would rename the columns — keep in sync if
# upgrading pandas.
durations = players.groupby("sessionId").agg({ "serverTime": [ np.min, np.max ] })
durations["duration"] = pd.to_datetime(durations["serverTime"]["amax"]) - pd.to_datetime(durations["serverTime"]["amin"])
durations["duration"] = durations["duration"].map(lambda x: np.timedelta64(x, 's'))
durations = durations.sort_values(by=['duration'], ascending=[False])
durations.head()
Duration plot
In [ ]:
type(durations)
In [ ]:
#durations.loc[:,'duration']
#durations = durations[4:]
# Seconds played per session, ranked longest-first for the rank plot below.
# NOTE(review): Timedelta.seconds drops whole days — use .total_seconds() if
# sessions can exceed 24h; TODO confirm that never happens here.
durations["duration_seconds"] = durations["duration"].map(lambda x: pd.Timedelta(x).seconds)
maxDuration = np.max(durations["duration_seconds"])
durations["duration_rank"] = durations["duration_seconds"].rank(ascending=False)
ax = durations.plot(x="duration_rank", y="duration_seconds")
plt.xlabel("game session")
plt.ylabel("time played (s)")
#plt.legend('')
ax.legend_.remove()
plt.xlim(0, sessionscount)
plt.ylim(0, maxDuration)
durations["duration_seconds"].describe()
#durations.head()
In [ ]:
# Completion rates for the three playtest samples (total, phase 1, phase 2).
getCompletedRate(rmdfPlaytestTotalPretestPosttestUniqueProfilesVolunteers),\
getCompletedRate(rmdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers),\
getCompletedRate(rmdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers),\
In [ ]:
# Unique players per playtest phase.
rmdfPlaytestPhase1PretestPosttestUniqueProfilesVolunteers[QUserId].nunique(),\
rmdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers[QUserId].nunique()
In [ ]:
getCompletedRate(rmdfWebgl1522Timed),\
getCompletedRate(rmdfWebgl160Timed)
In [ ]:
### Scores
# Independent two-sample t-test of posttest scores, phase 1 vs phase 2,
# first on all unique profiles, then on volunteers only.
scoresPhase1 = allDataPlaytestPhase1PretestPosttestUniqueProfiles.loc['scoreposttest',:]
scoresPhase2 = allDataPlaytestPhase2PretestPosttestUniqueProfiles.loc['scoreposttest',:]
ttest = ttest_ind(scoresPhase1, scoresPhase2)
ttest
print("t test: statistic=" + repr(ttest.statistic) + " pvalue=" + repr(ttest.pvalue))
scoresPhase1 = allDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.loc['scoreposttest',:]
scoresPhase2 = allDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers.loc['scoreposttest',:]
ttest = ttest_ind(scoresPhase1, scoresPhase2)
ttest
print("t test: statistic=" + repr(ttest.statistic) + " pvalue=" + repr(ttest.pvalue))
In [ ]:
# t-tests of per-chapter time criteria between phases:
# labels ch00completion..ch14completion and ch00total..ch14total.
nbs = ["{0:0=2d}".format(i) for i in range(0,15)]
completions = ['ch' + nb + 'completion' for nb in nbs]
totals = ['ch' + nb + 'total' for nb in nbs]
timeLabels = ['totalTime', 'completionTime'] + completions + totals
for timeLabel in timeLabels:
    timesPhase1 = allDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.loc[timeLabel,:]
    timesPhase2 = allDataPlaytestPhase2PretestPosttestUniqueProfilesVolunteers.loc[timeLabel,:]
    ttest = ttest_ind(timesPhase1, timesPhase2)
    ttest
    print(timeLabel + " t test: statistic=" + repr(ttest.statistic) + " pvalue=" + repr(ttest.pvalue))
In [ ]:
allDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.index.tolist()
In [ ]:
getRecordPlayer(rmdf1522, gform)
In [ ]:
getRecordPlayer(rmdf160, gform)