In [174]:
%run dataFormating.ipynb
In [175]:
import sklearn
print (sklearn.__version__)
In [176]:
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LassoCV, Lasso
from sklearn.linear_model import RidgeCV, Ridge
from ipywidgets import FloatProgress
from IPython.display import display
from math import *
from scipy import stats
from scipy.stats.mstats import normaltest
from matplotlib.pyplot import boxplot
Note: only decision-tree-based methods are used here, because other methods such as Naive Bayes do not make sense on categorical data.
In [177]:
# Scientific questions are the columns whose name begins with "Q".
scientificColumns = [name for name in defForms.columns.values if name[0] == "Q"]
# Features: the scientific-question columns; target: the integer temporality.
features = defForms[scientificColumns]
target = defForms["temporality"].astype(int)
In [178]:
# Classify using a single decision tree - accounts for the small size of the
# dataset and the categorical nature of the features.
# max_features="sqrt" replaces the former "auto" alias: for classifiers "auto"
# meant sqrt(n_features), and the alias is rejected by recent scikit-learn.
clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0, max_features="sqrt")
scores = cross_val_score(clf, features, target)
scores.mean()
Out[178]:
In [179]:
# Random forest: an ensemble of bootstrapped trees, chosen for the small
# dataset and categorical features, and to limit overfitting.
clf = RandomForestClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
    bootstrap=True,
)
scores = cross_val_score(estimator=clf, X=features, y=target)
scores.mean()
Out[179]:
In [180]:
# Extra-trees classifier: more randomised than a random forest.
clf = ExtraTreesClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
    bootstrap=True,
)
scores = cross_val_score(estimator=clf, X=features, y=target)
scores.mean()
Out[180]:
Conclusion: Accuracy is around 85%. Not bad but we expected better (17/01/2018)
In [181]:
# Scientific questions are the columns whose name begins with "Q".
scientificColumns = [name for name in defCorrectedForms.columns.values if name[0] == "Q"]
# Features: the scientific-question columns; target: the integer temporality.
features = defCorrectedForms[scientificColumns]
target = defCorrectedForms["temporality"].astype(int)
In [182]:
# Classify using a single decision tree - accounts for the small size of the
# dataset and the categorical nature of the features.
# max_features="sqrt" replaces the former "auto" alias: for classifiers "auto"
# meant sqrt(n_features), and the alias is rejected by recent scikit-learn.
clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0, max_features="sqrt")
scores = cross_val_score(clf, features, target)
scores.mean()
Out[182]:
In [183]:
# Random forest: an ensemble of bootstrapped trees, chosen for the small
# dataset and categorical features, and to limit overfitting.
clf = RandomForestClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
    bootstrap=True,
)
scores = cross_val_score(estimator=clf, X=features, y=target)
scores.mean()
Out[183]:
In [184]:
# Extra-trees classifier: more randomised than a random forest.
clf = ExtraTreesClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
    bootstrap=True,
)
scores = cross_val_score(estimator=clf, X=features, y=target)
scores.mean()
Out[184]:
Conclusion: Accuracy is around 80%. Not bad but we expected better (19/12/2017)
In [185]:
def getPosttestUserIds(gfdf):
    """Return the unique user ids of the rows of `gfdf` whose temporality
    equals `answerTemporalities[1]`.

    `QTemporality`, `QUserId` and `answerTemporalities` are notebook-level
    names defined outside this cell (presumably index 1 is the post-test
    temporality - confirm against their definition).
    """
    isSelectedTemporality = gfdf[QTemporality] == answerTemporalities[1]
    selectedRows = gfdf[isSelectedTemporality]
    return selectedRows[QUserId].unique()
In [186]:
# Build the prepared data table for the users returned by getPosttestUserIds
# on the Webgl1522 unique-profile forms, using the correct and demographic
# answers as source and with debug printing off.
allDataWebgl1522 = prepareAllData(getAllUserVectorData(
    getPosttestUserIds(gfdfWebgl1522UniqueProfiles),
    rmdfWebgl1522UniqueProfiles,
    gfdfWebgl1522UniqueProfiles,
    _source = correctAnswers + demographicAnswers,
    _printDebug=False))
In [187]:
# Same preparation for the Webgl1522 pretest/posttest volunteer sample; the
# user list comes from getAllResponders instead of getPosttestUserIds.
allDataWebgl1522Volunteers = prepareAllData(getAllUserVectorData(
    getAllResponders(gfdfWebgl1522PretestPosttestUniqueProfilesVolunteers),
    rmdfWebgl1522PretestPosttestUniqueProfilesVolunteers,
    gfdfWebgl1522PretestPosttestUniqueProfilesVolunteers,
    _source = correctAnswers + demographicAnswers,
    _printDebug=False))
In [188]:
# Build the prepared data table for the users returned by getPosttestUserIds
# on the Webgl160 unique-profile forms.
allDataWebgl160 = prepareAllData(getAllUserVectorData(
    getPosttestUserIds(gfdfWebgl160UniqueProfiles),
    rmdfWebgl160UniqueProfiles,
    gfdfWebgl160UniqueProfiles,
    _source = correctAnswers + demographicAnswers,
    _printDebug=False))
In [189]:
# Same preparation for the Webgl160 pretest/posttest volunteer sample; the
# user list comes from getAllResponders.
allDataWebgl160Volunteers = prepareAllData(getAllUserVectorData(
    getAllResponders(gfdfWebgl160PretestPosttestUniqueProfilesVolunteers),
    rmdfWebgl160PretestPosttestUniqueProfilesVolunteers,
    gfdfWebgl160PretestPosttestUniqueProfilesVolunteers,
    _source = correctAnswers + demographicAnswers,
    _printDebug=False))
In [190]:
# Same preparation for the PlaytestPhase2 pretest/posttest volunteer sample;
# the user list comes from getAllResponders.
allDataPlaytestPhase2 = prepareAllData(getAllUserVectorData(
    getAllResponders(gfdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers),
    rmdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers,
    gfdfPlaytestPhase2PretestPosttestUniqueProfilesVolunteers,
    _source = correctAnswers + demographicAnswers,
    _printDebug=False))
In [191]:
def getAnonymousData(allDataClassif):
    """Return a copy of `allDataClassif` without its "anonymousID" column.

    Raises KeyError if the column is absent; the input frame is not modified.
    """
    return allDataClassif.drop(columns=["anonymousID"])
In [192]:
# Columns to exclude from the features because they carry direct information
# on the post-test score (resp. the pre-test score).
# Index.intersection is used instead of `Index & list`: the `&` operator is no
# longer a set intersection on a pandas Index in current pandas versions.
dropPosttestColumns = allDataClassif.columns.intersection(
    deltaQuestions + posttestQuestions + ["scoreposttest", "scoredelta"]
)
dropPretestColumns = allDataClassif.columns.intersection(
    pretestQuestions + ["scorepretest"]
)
In [193]:
def getUnscaledFeatures(anonymousData, dropPosttest=True, dropPretest=True):
    """Return the rows of `anonymousData` with a non-negative "scoreposttest",
    optionally without the post-test and/or pre-test columns.

    The columns removed are the notebook-level `dropPosttestColumns` and
    `dropPretestColumns`; each is only looked up when its flag is true.
    """
    hasValidPosttest = anonymousData["scoreposttest"] >= 0
    result = anonymousData[hasValidPosttest]
    if dropPosttest:
        result = result.drop(columns=dropPosttestColumns)
    if dropPretest:
        result = result.drop(columns=dropPretestColumns)
    return result
In [194]:
def getFeaturesTarget(allDataClassif, chosenModel = Lasso):
    """Standardise the game features, cross-validate `chosenModel` on the
    post-test score and fit it on the whole sample.

    Steps: drop the id column, keep rows with a non-negative "scoreposttest",
    drop pre/post-test columns (getUnscaledFeatures defaults), standardise,
    run a 10-fold cross-validation, draw a boxplot of the fold scores and print
    their mean and standard deviation, then fit the model on all rows.

    The scaler is fitted on all rows before cross-validation. The printed
    "Accuracy" is whatever default score `cross_val_score` uses for the model.

    Returns (scores, standardScaler, model, features, target, unscaledFeatures).
    """
    anonymousData = getAnonymousData(allDataClassif)

    # Features and target share the same row filter (scoreposttest >= 0).
    unscaledFeatures = getUnscaledFeatures(anonymousData)
    hasValidPosttest = anonymousData["scoreposttest"] >= 0
    target = anonymousData.loc[hasValidPosttest, "scoreposttest"]

    # Center and scale; the fitted scaler is returned for reuse on other samples.
    standardScaler = preprocessing.StandardScaler().fit(unscaledFeatures)
    features = standardScaler.transform(unscaledFeatures)

    # Cross-validated evaluation, then a final fit on the full data.
    model = chosenModel()
    scores = cross_val_score(model, features, target, cv=10)
    boxplot(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    model.fit(features, target)

    return scores, standardScaler, model, features, target, unscaledFeatures
In [195]:
# Fit the default (Lasso) pipeline on allDataClassif; the scaler, model and
# feature tables are kept as notebook-level variables and reused by later cells.
scores, standardScaler, model, features, target, unscaledFeatures = getFeaturesTarget(allDataClassif)
In [196]:
def getInvertedCriteria(allData, criteria):
    """Return a copy of `allData` where each column x named in `criteria` is
    replaced by 1 / (1 + x).

    Criteria absent from `allData` are reported with a printed warning and
    skipped; previously the warning was followed by a KeyError on the first
    missing name. Membership is tested directly because `Index & list` is no
    longer a set intersection in current pandas. The input frame is not
    modified.
    """
    result = allData.copy()
    presentCriteria = [criterion for criterion in criteria if criterion in result.columns]
    if len(presentCriteria) != len(criteria):
        print("not all criteria are in input columns")
    for criterion in presentCriteria:
        result[criterion] = 1 / (1 + result[criterion])
    return result
In [197]:
# Same pipeline as above, but with the total-time and completion-time criteria
# replaced by their inverse 1 / (1 + x).
allDataClassifInv = getAllDataClassif(
    getInvertedCriteria(allData, totalTimesCriteria + completionTimesCriteria)
)
(scoresInv, standardScalerInv, modelInv,
 featuresInv, targetInv, unscaledFeaturesInv) = getFeaturesTarget(allDataClassifInv)
In [198]:
#list(set(allDataClassifInv.columns) - set(['anonymousID']))
In [199]:
#criteria = list(\
# set(adc.columns)\
# - set(adc.columns & \
# (deltaQuestions + posttestQuestions
# + pretestQuestions
# + ["scoreposttest", "scoredelta", 'scoreundefined', "anonymousID"]
# + ["scorepretest"]
# + predefinedCriteria
# ))\
#)
# Mean cross-validated score when every column other than the question,
# score and id columns is used as a feature.
# NOTE(review): getScoresMean is defined in a later cell of this notebook, so
# this cell fails on a fresh top-to-bottom run until that cell has been executed.
getScoresMean(allDataClassifInv, list(set(allDataClassifInv.columns)
    - set(
        deltaQuestions
        + posttestQuestions
        + pretestQuestions
        + ["scorepretest", "scoreposttest", "scoredelta", 'scoreundefined', 'anonymousID']
    )))
Out[199]:
In [200]:
def getPrediction(standardScaler, model, unscaledX):
    """Predict the value for one unscaled feature vector.

    `unscaledX` is wrapped into a one-row batch, scaled with
    `standardScaler.transform`, passed to `model.predict`, and the first
    (only) prediction is returned.
    """
    singleRowBatch = [unscaledX]
    scaledBatch = standardScaler.transform(singleRowBatch)
    predictions = model.predict(scaledBatch)
    return predictions[0]
In [201]:
def getPredictionVsActual(standardScaler, model, allDataClassif):
    """Compare the model's predicted post-test scores with the actual ones.

    Returns (result, r2Coef). `result` is a DataFrame indexed like the usable
    rows of `allDataClassif` (non-negative "scoreposttest") with columns
    "predicted", "actual" and "error" (predicted - actual). `r2Coef` is
    `model.score` on the scaled features against the actual scores.

    All rows are scaled and predicted in one call. The previous version filled
    an integer-initialised frame with float predictions one cell at a time,
    which relies on implicit dtype upcasting and is slow.
    """
    anonymousData = getAnonymousData(allDataClassif)
    unscaledFeatures = getUnscaledFeatures(anonymousData)
    # Same row filter, columns kept, so "actual" lines up row by row with the
    # features.
    actual = getUnscaledFeatures(
        anonymousData, dropPosttest=False, dropPretest=False
    )["scoreposttest"]

    scaledFeatures = standardScaler.transform(unscaledFeatures)
    result = pd.DataFrame(
        {"predicted": model.predict(scaledFeatures), "actual": actual.values},
        index=unscaledFeatures.index,
    )
    result["error"] = result["predicted"] - result["actual"]

    r2Coef = model.score(scaledFeatures, result["actual"].values)
    return result, r2Coef
In [202]:
# Evaluate the two fitted models (plain and inverted-time features) on each
# sample and print "<row count>: <score>" for both variants.
samples = [
    allDataWebgl1522,
    allDataWebgl1522Volunteers,
    allDataWebgl160,
    allDataWebgl160Volunteers,
    allDataPlaytestPhase2,
]
for sample in samples:
    # Plain features, model fitted on allDataClassif.
    _allDataClassif = getAllDataClassif(sample)
    result, r2Coef = getPredictionVsActual(standardScaler, model, _allDataClassif)
    print("{0:0=2d}: {1}".format(len(_allDataClassif), r2Coef))

    # Inverted time criteria, model fitted on allDataClassifInv.
    _allDataClassifInv = getAllDataClassif(
        getInvertedCriteria(sample, totalTimesCriteria + completionTimesCriteria)
    )
    resultInv, r2CoefInv = getPredictionVsActual(standardScalerInv, modelInv, _allDataClassifInv)
    print("{0:0=2d} inv: {1}".format(len(_allDataClassifInv), r2CoefInv))
In [203]:
def getLassoModelCoefficients(model, unscaledFeatures, useAbs = True):
    """Return the non-zero coefficients of `model` as a Series sorted in
    ascending order and labelled with the matching columns of `unscaledFeatures`.

    With `useAbs` (default) the absolute values are returned, so the sort ranks
    features by magnitude. `model.coef_` must line up positionally with
    `unscaledFeatures.columns`.
    """
    coefficients = model.coef_
    keptPositions = np.nonzero(coefficients)
    keptValues = coefficients[keptPositions]
    if useAbs:
        keptValues = np.abs(keptValues)
    labelled = pd.Series(data=keptValues, index=unscaledFeatures.columns[keptPositions])
    return labelled.sort_values()
In [204]:
# Non-zero coefficients (absolute values, ascending) of the model fitted on allDataClassif.
getLassoModelCoefficients(model, unscaledFeatures)
Out[204]:
In [205]:
# Non-zero coefficients of the inverted-time model, labelled with the columns
# of the table that model was actually fitted on (unscaledFeaturesInv).
getLassoModelCoefficients(modelInv, unscaledFeaturesInv)
Out[205]:
In [206]:
#unscaledFeatures = getUnscaledFeatures(getAnonymousData(allDataClassifWebgl160Volunteers))
#unscaledX = unscaledFeatures.iloc[0].values
#X = standardScaler.transform([unscaledX])
#model.predict(X)[0]
#X = (unscaledX - standardScaler.mean_) / standardScaler.scale_
#model.predict([X])[0]
#np.dot(model.coef_, X) + model.intercept_
In [207]:
# Disabled cell: bar chart of predicted vs actual post-test score, with users
# sorted by actual score.
# NOTE(review): only dropPosttestColumns are dropped here, whereas
# getFeaturesTarget fits standardScaler after getUnscaledFeatures has also
# dropped dropPretestColumns - the column sets may not match; confirm before
# re-enabling.
if False:
    anonymousData = getAnonymousData(allDataClassif)
    sortedUnscaledFeatures = anonymousData[anonymousData["scoreposttest"] >= 0].sort_values(by="scoreposttest").drop(dropPosttestColumns, axis = 1)
    sortedTarget = sorted(anonymousData[anonymousData["scoreposttest"] >= 0]["scoreposttest"])
    # Center and scale data variant
    sortedFeatures = standardScaler.transform(sortedUnscaledFeatures)
    x = range(len(sortedFeatures))
    alpha = 0.5
    fig, ax = plt.subplots()
    plt.title('Actual vs predicted score')
    plt.xlabel('User index')
    plt.ylabel('Score')
    #plt.plot(x, model.predict(sortedFeatures), kind = 'bar')
    #plt.plot(x, sortedTarget)
    ax.bar(x, model.predict(sortedFeatures), alpha=alpha, label='predicted', linewidth=0)
    ax.bar(x, sortedTarget, alpha=alpha, label='actual')
    ax.legend()
    fig.tight_layout()
    plt.show()
Conclusion: Score cannot be predicted by the table of RedMetrics data (19/07/2018).
In [208]:
def getFeaturesTargetSecondDegreePolynomial(allDataClassif, chosenModel = Lasso):
    """Variant of getFeaturesTarget that expands the features to all degree-2
    polynomial terms (bias column included) before standardising.

    Runs a 10-fold cross-validation of `chosenModel`, draws a boxplot of the
    fold scores, prints their mean and standard deviation, then fits the model
    on all rows.

    Returns (scores, standardScaler, model, features, target, unscaledFeatures),
    where `unscaledFeatures` is the polynomial-expanded array, not the original
    DataFrame.
    """
    anonymousData = getAnonymousData(allDataClassif)

    # Features and target share the same row filter (scoreposttest >= 0).
    hasValidPosttest = anonymousData["scoreposttest"] >= 0
    target = anonymousData.loc[hasValidPosttest, "scoreposttest"]

    # Degree-2 polynomial expansion of the raw features.
    secondDegreeFeatures = preprocessing.PolynomialFeatures(
        degree=2, interaction_only=False, include_bias=True
    )
    unscaledFeatures = secondDegreeFeatures.fit_transform(getUnscaledFeatures(anonymousData))

    # Center and scale the expanded features.
    standardScaler = preprocessing.StandardScaler()
    features = standardScaler.fit_transform(unscaledFeatures)

    # Cross-validated evaluation, then a final fit on the full data.
    model = chosenModel()
    scores = cross_val_score(model, features, target, cv=10)
    boxplot(scores)
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
    model.fit(features, target)

    return scores, standardScaler, model, features, target, unscaledFeatures
In [209]:
# Degree-2 polynomial pipeline on the plain table, then on the inverted-time table.
(scores1, standardScaler1, model1,
 features1, target1, unscaledFeatures1) = getFeaturesTargetSecondDegreePolynomial(allDataClassif)
(scores2, standardScaler2, model2,
 features2, target2, unscaledFeatures2) = getFeaturesTargetSecondDegreePolynomial(allDataClassifInv)
Conclusion: Score cannot be predicted by the table of RedMetrics data + second degree polynomial (30/01/2018)
Let's try reducing the number of features.
In [210]:
# Remove id
anonymousData = getAnonymousData(allDataClassifInv)
# Get features and target
# Only select rows where scoreposttest is not negative
unscaledFeatures = anonymousData[anonymousData["scoreposttest"] >= 0]
#unscaledFeatures = unscaledFeatures[["craft", "death", "add", "remove", "reach", "maxChapter"] + totalTimesCriteria + completionTimesCriteria]
#unscaledFeatures = unscaledFeatures[["craft", "death", "add", "remove", "reach", "maxChapter"]]
#unscaledFeatures = unscaledFeatures[totalTimesCriteria]
#unscaledFeatures = unscaledFeatures[completionTimesCriteria]
#unscaledFeatures = unscaledFeatures[["maxChapter", "ch05completion", "ch07completion", "ch07total", "ch09total"]]
#unscaledFeatures = unscaledFeatures[['pretest Enjoyed playing', 'scorepretest', 'pretest Want to learn more about Biology', 'ch05total', 'ch07total']]
#unscaledFeatures = unscaledFeatures[['ch05completion', 'ch08total', 'ch06total', 'scorepretest', 'pretest Want to learn more about Biology', 'ch05total', 'ch07total']]
if False:#'columnsForRegression' in globals():
    unscaledFeatures = unscaledFeatures[columnsForRegression]
else:
    # unscaledFeatures = unscaledFeatures[['ch05completion', 'ch08total', 'ch06total', 'scorepretest', 'pretest Want to learn more about Biology', 'ch05total', 'ch07total']]
    # unscaledFeatures = unscaledFeatures[['pretest Enjoyed playing', 'scorepretest', 'pretest Want to learn more about Biology', 'ch05total', 'ch07total']]
    unscaledFeatures = unscaledFeatures[['ch02completion', 'ch05completion', 'ch05total', 'ch07total', 'ch08total']]
target = anonymousData[anonymousData["scoreposttest"] >= 0]["scoreposttest"]
# Center and scale data.
# A degree-2 polynomial expansion used to be computed here and was overwritten
# on the very next line, so it never reached the model; it has been removed.
# The model must be trained on the unexpanded columns anyway, because
# getLassoModelCoefficients below pairs model.coef_ positionally with
# unscaledFeatures.columns.
features = preprocessing.scale(unscaledFeatures)
# Run Lasso regression with cross-validation
model = Lasso()
scores = cross_val_score(model, features, target, cv=10)
boxplot(scores)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))
model.fit(features, target)
getLassoModelCoefficients(model, unscaledFeatures)
Out[210]:
In [211]:
def getScoresMean(allDataClassif, columnsSubset):
    """Return the mean 10-fold cross-validation score of a default Lasso model
    predicting "scoreposttest" from the standardised `columnsSubset` columns.

    Only rows with a non-negative "scoreposttest" are used.

    The former degree-2 polynomial expansion was computed and then overwritten
    by `preprocessing.scale(unscaledFeatures)`, so it never influenced the
    score. It is removed, because this function is called once per candidate
    subset in the exhaustive search and the discarded work was repeated each
    time.
    TODO(review): if polynomial terms were meant to be used, scale the expanded
    matrix instead - this changes every score and the thresholds derived from them.
    """
    anonymousData = getAnonymousData(allDataClassif)
    hasValidPosttest = anonymousData["scoreposttest"] >= 0
    unscaledFeatures = anonymousData.loc[hasValidPosttest, columnsSubset]
    target = anonymousData.loc[hasValidPosttest, "scoreposttest"]

    features = preprocessing.scale(unscaledFeatures)
    model = Lasso()
    scores = cross_val_score(model, features, target, cv=10)
    return scores.mean()
In [212]:
# Number of possible subsets of size 3 to 8 drawn from a set of 96 elements.
import scipy.special
tuple(scipy.special.binom(96, k) for k in range(3, 9))
Out[212]:
In [213]:
# Inject a small script and link into the notebook page that show or hide all
# `div.output_stderr` elements (the stderr output areas of cells).
# The script calls the toggle once when the document is ready.
from IPython.display import HTML
HTML('''<script>
code_show_err=false;
function code_toggle_err() {
if (code_show_err){
$('div.output_stderr').hide();
} else {
$('div.output_stderr').show();
}
code_show_err = !code_show_err
}
$( document ).ready(code_toggle_err);
</script>
To toggle on/off output_stderr, click <a href="javascript:code_toggle_err()">here</a>.''')
Out[213]:
In [214]:
def getETA(computations, timestamp, computationSpeed = 2794155 / 42338):
    """Estimate when `computations` evaluations started at `timestamp` will finish.

    Parameters
    ----------
    computations : number of evaluations still to run.
    timestamp : pandas Timestamp of the start.
    computationSpeed : evaluations per second. The default is the speed that
        was previously hard-coded in the body (2794155 evaluations in
        42338 seconds).

    Returns the Timestamp `timestamp + computations / computationSpeed` seconds.
    Raises ValueError if `computationSpeed` is not strictly positive.
    """
    if computationSpeed <= 0:
        raise ValueError("computationSpeed must be strictly positive")
    duration = computations / computationSpeed
    return timestamp + pd.Timedelta(seconds = duration)
In [ ]:
import itertools
import time
import scipy.special
import warnings
from ipywidgets import Textarea, FloatText, ToggleButton, Checkbox
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
#adc = allDataClassif.copy()
adc = allDataClassifInv.copy()
# criteria with pretest info
#predefinedCriteria = ['ch05completion', 'scorepretest', 'pretest Want to learn more about Biology', 'ch07total', 'ch05total',]
# criteria with only RM info
#predefinedCriteria = ['ch02completion', 'ch05completion', 'ch05total', 'ch07total', 'ch08total',]
#predefinedCriteria = ['ch02completion', 'ch05completion', 'ch05total', 'ch07total', 'ch08total',]
predefinedCriteria = ['ch06completion', 'ch02completion', 'ch07total', 'ch05total', ]
# Candidate criteria: every column that is not a question, score or id column
# and not already in predefinedCriteria.
# A plain set difference replaces `adc.columns & [...]`, which is no longer a
# set intersection in current pandas. sorted() makes the enumeration order of
# the subsets reproducible from one kernel to the next (assumes column names
# are all strings).
criteria = sorted(
    set(adc.columns)
    - set(
        deltaQuestions + posttestQuestions
        + pretestQuestions
        + ["scoreposttest", "scoredelta", 'scoreundefined', "anonymousID"]
        + ["scorepretest"]
        + predefinedCriteria
    )
)
subsetSize = 4
combinations = scipy.special.binom(len(criteria), subsetSize)
print("#combinations="+str(combinations))
print("ETA " + str(getETA(combinations, pd.Timestamp.now())))
In [ ]:
if True:
    # very long computation time: > 10h
    # Exhaustive search: score every subset of `subsetSize` criteria (plus
    # predefinedCriteria) and keep the best-scoring one above the initial
    # maxScore threshold.
    # IntProgress / IntText are imported here because no visible cell of this
    # notebook imports them.
    from ipywidgets import IntProgress, IntText
    maxScore = 0.36
    i = 0
    columnsForRegression = []
    # `combinations` is a float (scipy.special.binom); widgets and counters need an int.
    iterations = int(combinations) + 2
    _progress = IntProgress(min=0, max=iterations)
    _intText = IntText(0)
    _currentBest = FloatText(0.0)
    _currentCriteria = Textarea("")
    #_stopButton = ToggleButton(value=False, description='Stop')
    #_stopCheckbox = Checkbox(value=False, description='Stop')
    display(_progress)
    display(_intText)
    display(_currentBest)
    display(_currentCriteria)
    #display(_stopButton)
    #display(_stopCheckbox)
    iterator = itertools.combinations(criteria, subsetSize)
    start_time = time.time()
    for columnsSubset in iterator:
        #if _stopButton.value or _stopCheckbox.value or (i >= iterations):
        if (i >= iterations):
            break
        i += 1
        _progress.value += 1
        _intText.value += 1
        score = getScoresMean(adc, list(columnsSubset) + predefinedCriteria)
        if score > maxScore:
            # New best subset: remember it and show it in the widgets.
            maxScore = score
            _currentBest.value = score
            columnsForRegression = list(columnsSubset) + predefinedCriteria
            _currentCriteria.value = str(columnsForRegression)
    print("--- executed %s / %s in %s seconds ---" % (i, combinations, time.time() - start_time))
    print("--- end time: " + str(pd.Timestamp.now()))
maxScore, columnsForRegression
In [217]:
# How long (in hours) the full search takes, from three measured speeds
# expressed as (seconds per batch * total evaluations / batch size).
# Written as a parenthesised tuple: the cell used to end with a dangling
# `,\` line continuation.
(
    (17 * 61124064 / 1000) / 3600,
    (249 * 57940519 / 15000) / 3600,
    (204 * 57940519 / 15000) / 3600,
)
Out[217]:
In [218]:
# Number of evaluations done in five minutes at 17 seconds per 1000 evaluations.
durationSeconds = 60 * 5
1000 * durationSeconds / 17
Out[218]:
Conclusion: Tried different combinations, but cannot find any interesting regression (02/02/2018)
In [219]:
# Drop the id column.
anonymousData = gameAndCorrectedAfterDataClassif.drop(columns=["anonymousID"])
# Rows with a non-negative scoreposttest only; features are the columns from
# "sessionsCount" to "completionTime" (inclusive), the target is "biologyStudy".
features = anonymousData.loc[anonymousData["scoreposttest"] >= 0, "sessionsCount":"completionTime"]
target = anonymousData.loc[anonymousData["scoreposttest"] >= 0, "biologyStudy"]
# Degree-2 polynomial expansion, then centering and scaling.
secondDegreeFeatures = preprocessing.PolynomialFeatures(
    degree=2, interaction_only=False, include_bias=True
)
features = preprocessing.scale(secondDegreeFeatures.fit_transform(features))
In [220]:
# 10-fold cross-validated Lasso regression; show the fold scores as a boxplot
# and as the cell output.
model = Lasso()
scores = cross_val_score(estimator=model, X=features, y=target, cv=10)
boxplot(scores)
scores
Out[220]:
Conclusion: No (30/01/2018)
In [221]:
# Remove id
anonymousData = gameAndCorrectedAfterDataClassif.drop("anonymousID", axis = 1)
# Get features and target
features = anonymousData.loc[:,"sessionsCount":"completionTime"]
# Target: per-user sum of game interest and game frequency.
# The builtin sum(a, b) means "sum the items of a, starting from b": it added
# the grand total of gameInterest to every gameFrequency value instead of
# adding the two columns row by row.
target = anonymousData["gameInterest"] + anonymousData["gameFrequency"]
# Add polynomial features
secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
features = secondDegreeFeatures.fit_transform(features)
# Center and scale data
features = preprocessing.scale(features)
In [222]:
# 10-fold cross-validated Lasso regression; show the fold scores as a boxplot
# and as the cell output.
model = Lasso()
scores = cross_val_score(estimator=model, X=features, y=target, cv=10)
boxplot(scores)
scores
Out[222]:
Conclusion: No (30/01/2018)
In [254]:
# Given a question tag, plot scores of cross-validated model
def tryClassification(data, scientificQuestion):
    """Cross-validate an extra-trees classifier predicting the answer to
    `scientificQuestion` and return [mean, standard deviation] of the 5 fold
    scores.

    Features are the notebook-level `criteria` columns of the rows with a
    non-negative "scoreposttest", expanded to degree-2 polynomial terms and
    standardised. A new figure with a boxplot of the fold scores is created.
    """
    anonymousData = data.drop(columns=["anonymousID"])
    usableRows = anonymousData[anonymousData["scoreposttest"] >= 0]

    # Features and integer target come from the same filtered rows.
    #features = features.iloc[:,24:37]
    features = usableRows.loc[:, criteria]
    target = usableRows.loc[:, scientificQuestion].astype('int')

    # Degree-2 polynomial expansion, then centering and scaling.
    secondDegreeFeatures = preprocessing.PolynomialFeatures(
        degree=2, interaction_only=False, include_bias=True
    )
    features = preprocessing.scale(secondDegreeFeatures.fit_transform(features))

    # Extra-trees classifier: more randomised than a random forest.
    clf = ExtraTreesClassifier(
        n_estimators=10,
        max_depth=None,
        min_samples_split=2,
        random_state=0,
        bootstrap=True,
    )
    scores = cross_val_score(clf, features, target, cv=5)

    # New figure, then the boxplot is drawn on its axes.
    fig, ax = plt.subplots()
    boxplot(scores)
    return [scores.mean(), scores.std()]
In [224]:
# Index labels of the entries of correctAnswers whose value has a non-zero length.
scientificQuestionsDescrs = correctAnswers[correctAnswers.apply(len) != 0].index.values.tolist()
#scientificQuestionsDescrs
In [225]:
# Inspect the columns at positions 24 to 36 of anonymousData (the positional
# range once used as features in tryClassification).
anonymousData.columns[24:37]
Out[225]:
In [226]:
# Check that every criterion is a column of anonymousData.
# `set(criteria) in set(columns)` tested whether the whole set was one
# *element* of the column set, which is always False for string column names.
set(criteria).issubset(anonymousData.columns)#[24:37]
Out[226]:
In [227]:
#[c for c in criteria if c not in anonymousData.columns]
#[c for c in anonymousData.columns if c not in criteria]
#anonymousData[criteria]
In [255]:
# One column per scientific question, holding the two values returned by
# tryClassification.
# NOTE(review): tryClassification returns [scores.mean(), scores.std()], so the
# row labelled "Var" actually holds a standard deviation, not a variance.
allScores = pd.DataFrame(index = ["Mean", "Var"])
for question in scientificQuestions:# ["QGenotypePhenotype", "QBioBricksDevicesComposition", "QAmpicillin", "QBBNamePlasmid", "QBBFunctionTER", "QBBNamePromoter", "QBBFunctionGameCDS", "QBBNameTerminator", "QBBFunctionBiologyCDS", "QBBNameRBS", "QBBExampleCDS", "QBBNameCDS", "QBBFunctionPR", "QBBFunctionRBS", "QBBFunctionPlasmid", "QBBNameOperator", "QDeviceRbsPconsFlhdcTer", "QDevicePconsRbsFlhdcTer", "QDevicePbadRbsGfpTer", "QDevicePbadGfpRbsTer", "QDeviceGfpRbsPconsTer", "QDevicePconsGfpRbsTer", "QDeviceAmprRbsPconsTer", "QDeviceRbsPconsAmprTer", "QGreenFluorescence", "QUnequipDevice", "QDevicePbadRbsAraTer"]:
    questionTag = question
    scores = tryClassification(gameAndCorrectedAfterDataClassif, questionTag)
    allScores[questionTag] = scores
# Replace the question tags by their descriptions; this assumes
# scientificQuestionsDescrs has one entry per question, in the same order.
allScores.columns = scientificQuestionsDescrs
allScores.T
Out[255]:
Conclusion: RedMetrics data can be used to predict answers to certain scientific questions (29/05/2018). TODO Raphael: check which questions you want additional analysis for.
In [256]:
#from scipy import stats
# Summary statistics of the per-question mean cross-validation scores.
stats.describe(allScores.loc['Mean',:])
Out[256]:
In [230]:
def getBoxplot(scores, title = ''):
    """Draw a boxplot of `scores` on a new single-axes figure titled `title`.

    Nothing is returned; the figure is left to the notebook's display.
    """
    fig, ax = plt.subplots()
    ax.boxplot(scores)
    ax.set_title(title)
In [231]:
#pd.concat([anonymousData.loc[:,"sessionsCount":"completionTime"], anonymousData.loc[:,"gameInterest":"previousPlay"]], axis=1).columns
In [232]:
#anonymousData.columns.values
In [233]:
# Columns used as features by getPerformanceFromQuestionGroup.
# NOTE(review): this list contains 'scoreposttest', 'scoredelta' and
# 'scoreundefined'. Elsewhere in this notebook "scoreposttest" and "scoredelta"
# are excluded from features as carrying direct information on the post-test
# score. If the question columns predicted from these features are post-test
# answers, they leak the target - confirm before relying on the resulting scores.
ingameCriteria = ['sessionsCount', 'scoreposttest', 'scoreundefined', 'complete',
'configure', 'craft', 'death', 'equip', 'unequip', 'add', 'remove',
'gotourl', 'pickup', 'reach', 'restart', 'selectmenu', 'start',
'scoredelta', 'maxChapter', 'efficiency', 'thoroughness', 'fun',
'completionTime', 'ch00completion', 'ch01completion',
'ch02completion', 'ch03completion', 'ch04completion',
'ch05completion', 'ch06completion', 'ch07completion',
'ch08completion', 'ch09completion', 'ch10completion',
'ch11completion', 'ch12completion', 'ch13completion',
'ch14completion', 'ch00total', 'ch01total', 'ch02total',
'ch03total', 'ch04total', 'ch05total', 'ch06total', 'ch07total',
'ch08total', 'ch09total', 'ch10total', 'ch11total', 'ch12total',
'ch13total', 'ch14total', 'totalTime']
In [234]:
# boxplot function
# questions: array of strings of question names
def getPerformanceFromQuestionGroup(questions,
                                    thresholdPercentage = 1.0,
                                    extraTreesClassifier = False,
                                    randomForestClassifier = False,
                                    lasso = False,
                                    histTarget = 0
                                   ):
    """Evaluate how well the in-game criteria predict a group of questions.

    The numeric target is the per-user number of correct answers among
    `questions`. The categorical target is 1 when that number reaches
    `thresholdPercentage * len(questions)`, else 0. Features are the
    `ingameCriteria` columns of `gameAndCorrectedAfterDataClassif`, expanded to
    degree-2 polynomial terms and standardised.

    Parameters
    ----------
    questions : list of question column names.
    thresholdPercentage : fraction of correct answers required for class 1.
    extraTreesClassifier, randomForestClassifier : if true, cross-validate that
        classifier on the categorical target, print the mean score and draw a
        boxplot of the fold scores.
    lasso : if true, cross-validate a Lasso regression on the numeric target,
        print the mean score and draw a boxplot.
    histTarget : if > 0, draw a histogram of the numeric target with bins
        `range(histTarget)`.

    Returns None; results are printed and plotted.
    """
    # Remove id
    anonymousData = gameAndCorrectedAfterDataClassif.drop("anonymousID", axis = 1)

    # Get features and target
    #features = pd.concat([anonymousData.loc[:,"sessionsCount":"completionTime"], anonymousData.loc[:,"gameInterest":"previousPlay"]], axis=1)
    features = anonymousData.loc[:,ingameCriteria]
    digitalTarget = anonymousData.loc[:, questions].astype(int).sum(axis=1)
    categoricalTarget = digitalTarget.apply(lambda x: 0 if x < thresholdPercentage*len(questions) else 1)

    # Add polynomial features
    secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
    features = secondDegreeFeatures.fit_transform(features)

    # Center and scale data
    features = preprocessing.scale(features)

    if extraTreesClassifier:
        # Classify using extra tree classifiers, more random than random forest methods
        clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
        scores = cross_val_score(clf, features, categoricalTarget, cv=10)
        print("ExtraTreesClassifier scores mean: " + str(scores.mean()))
        # Display plot
        getBoxplot(scores, "ExtraTreesClassifier boxplot")

    if randomForestClassifier:
        # Classify using random forests -accounts for the small size of the dataset and the categorical nature of the features, limit overfitting
        clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
        # NOTE: unlike the two other models, this uses the default number of folds.
        scores = cross_val_score(clf, features, categoricalTarget)
        print("RandomForestClassifier scores mean: " + str(scores.mean()))
        # Display plot
        getBoxplot(scores, "RandomForestClassifier boxplot")

    if lasso:
        # Run Lasso regression with cross-validation
        model = Lasso()
        scores = cross_val_score(model, features, digitalTarget, cv=10)
        print("Lasso scores mean: " + str(scores.mean()))
        # Display plot
        getBoxplot(scores, "Lasso boxplot")

    if histTarget > 0:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # Histogram of this group's numeric target. The previous code plotted
        # the notebook-level `target` left over from an unrelated cell.
        ax.hist(digitalTarget, bins = range(histTarget))
In [235]:
# Question group labelled "hard"; class 1 = at least half of them answered correctly.
hardQuestions = [
    "QBBFunctionPR",
    "QBBNameOperator",
    "QDevicePbadRbsAraTer",
]
getPerformanceFromQuestionGroup(
    hardQuestions,
    thresholdPercentage = 0.5,
    extraTreesClassifier = True,
    randomForestClassifier = True,
    lasso = True,
)
Conclusion: Very high quality prediction (29/05/18)
In [236]:
# BioBrick name/function questions; class 1 = at least 60% answered correctly.
bbSymbolRecognition = [
    "QBBNamePlasmid",
    "QBBFunctionTER",
    "QBBNamePromoter",
    "QBBFunctionGameCDS",
    "QBBNameTerminator",
    "QBBFunctionBiologyCDS",
    "QBBNameRBS",
    "QBBExampleCDS",
    "QBBNameCDS",
    "QBBFunctionPR",
    "QBBFunctionRBS",
    "QBBFunctionPlasmid",
    "QBBNameOperator",
]
getPerformanceFromQuestionGroup(
    bbSymbolRecognition,
    thresholdPercentage = 0.6,
    extraTreesClassifier = True,
    randomForestClassifier = True,
    lasso = True,
)
Conclusion: No apparent possible prediction (1/02/2018)
In [237]:
# Question group labelled "easy"; class 1 = all of them answered correctly.
easyQuestions = [
    "QBioBricksDevicesComposition",
    "QDeviceRbsPconsFlhdcTer",
    "QGreenFluorescence",
]
getPerformanceFromQuestionGroup(
    easyQuestions,
    thresholdPercentage = 1.0,
    extraTreesClassifier = True,
    randomForestClassifier = True,
    lasso = True,
    histTarget = 14,
)
Conclusion: Inconclusive (01/02/2018)
In [238]:
# "Knowledge" question group; class 1 = at least 70% answered correctly.
knowledgeQuestions = [
    "QAmpicillin", "QBBNamePlasmid", "QBBNamePromoter", "QBBNameTerminator",
    "QBBNameRBS", "QBBNameCDS", "QBBNameOperator",
]
getPerformanceFromQuestionGroup(
    knowledgeQuestions,
    thresholdPercentage = 0.7,
    extraTreesClassifier = True,
    randomForestClassifier = True,
    lasso = True,
    histTarget = 14,
)
In [239]:
# "Comprehension" question group; class 1 = all of them answered correctly.
comprehensionQuestions = [
    "QBioBricksDevicesComposition", "QBBFunctionTER",
    "QBBFunctionPlasmid", "QUnequipDevice",
]
getPerformanceFromQuestionGroup(
    comprehensionQuestions,
    thresholdPercentage = 1.0,
    extraTreesClassifier = True,
    randomForestClassifier = True,
    lasso = True,
    histTarget = 14,
)
In [240]:
# "Application" question group; class 1 = all of them answered correctly.
applicationQuestions = [
    "QGenotypePhenotype", "QBBExampleCDS", "QGreenFluorescence",
]
getPerformanceFromQuestionGroup(
    applicationQuestions,
    thresholdPercentage = 1.0,
    extraTreesClassifier = True,
    randomForestClassifier = True,
    lasso = True,
    histTarget = 14,
)
In [241]:
# "Analysis" question group; class 1 = at least 70% answered correctly.
analysisQuestions = [
    "QBBFunctionGameCDS", "QBBFunctionBiologyCDS", "QBBFunctionPR",
    "QBBFunctionRBS", "QDevicePbadRbsAraTer",
]
getPerformanceFromQuestionGroup(
    analysisQuestions,
    thresholdPercentage = 0.7,
    extraTreesClassifier = True,
    randomForestClassifier = True,
    lasso = True,
    histTarget = 14,
)
In [242]:
# "Synthesis" question group (device questions); class 1 = all answered correctly.
synthesisQuestions = [
    "QDeviceRbsPconsFlhdcTer", "QDevicePconsRbsFlhdcTer",
    "QDevicePbadRbsGfpTer", "QDevicePbadGfpRbsTer",
    "QDeviceGfpRbsPconsTer", "QDevicePconsGfpRbsTer",
    "QDeviceAmprRbsPconsTer", "QDeviceRbsPconsAmprTer",
]
getPerformanceFromQuestionGroup(
    synthesisQuestions,
    thresholdPercentage = 1.0,
    extraTreesClassifier = True,
    randomForestClassifier = True,
    lasso = True,
    histTarget = 14,
)
In [243]:
# Drop the id column of the "before" table.
anonymousData = gameAndCorrectedBeforeDataClassif.drop(columns=["anonymousID"])
# The feature range ends at the last gender dummy column present in the data.
lastColumn = 'gender_Male'
for potentialLastColumn in ['gender_Other', 'gender_Prefer not to say']:
    if potentialLastColumn in anonymousData.columns:
        lastColumn = potentialLastColumn
# Features: columns "gameInterest" to lastColumn; target: completion time.
features = anonymousData.loc[:, "gameInterest":lastColumn]
target = anonymousData["completionTime"]
# Degree-2 polynomial expansion, then centering and scaling.
secondDegreeFeatures = preprocessing.PolynomialFeatures(
    degree=2, interaction_only=False, include_bias=True
)
features = preprocessing.scale(secondDegreeFeatures.fit_transform(features))
In [244]:
# 10-fold cross-validated Lasso (alpha=10, up to 10000 iterations); boxplot of
# the fold scores and their mean as the cell output.
model = Lasso(alpha=10, max_iter=10000)
scores = cross_val_score(estimator=model, X=features, y=target, cv=10)
boxplot(scores)
scores.mean()
Out[244]:
In [245]:
# Turn completion time into two classes: 0 below 7200, 1 otherwise.
# This overwrites `target`, so the cell is not idempotent: re-running it
# without re-running the feature cell yields an all-zero target.
target = target.apply(lambda completionTime: 0 if completionTime < 7200 else 1)
# Extra-trees classifier: more randomised than a random forest.
clf = ExtraTreesClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
    bootstrap=True,
)
scores = cross_val_score(estimator=clf, X=features, y=target, cv=10)
# Boxplot of the fold scores; the cell output is the share of class 1.
boxplot(scores)
scores.mean()
sum(target)/len(target)
Out[245]:
Conclusion: No (01/02/2018)
In [246]:
# Drop the id column of the "after" table.
anonymousData = gameAndCorrectedAfterDataClassif.drop(columns=["anonymousID"])
# The feature range ends at the last gender dummy column present in the data.
lastColumn = 'gender_Male'
for potentialLastColumn in ['gender_Other', 'gender_Prefer not to say']:
    if potentialLastColumn in anonymousData.columns:
        lastColumn = potentialLastColumn
# Features: columns "gameInterest" to lastColumn; target: completion time.
features = anonymousData.loc[:, "gameInterest":lastColumn]
target = anonymousData["completionTime"]
# Degree-2 polynomial expansion, then centering and scaling.
secondDegreeFeatures = preprocessing.PolynomialFeatures(
    degree=2, interaction_only=False, include_bias=True
)
features = preprocessing.scale(secondDegreeFeatures.fit_transform(features))
In [247]:
# 10-fold cross-validated Lasso (up to 1000000 iterations); boxplot of the fold
# scores and their mean as the cell output.
model = Lasso(max_iter=1000000)
scores = cross_val_score(estimator=model, X=features, y=target, cv=10)
boxplot(scores)
scores.mean()
Out[247]:
In [248]:
# Turn completion time into two classes: 0 below 7200, 1 otherwise.
# This overwrites `target`, so the cell is not idempotent: re-running it
# without re-running the feature cell yields an all-zero target.
target = target.apply(lambda completionTime: 0 if completionTime < 7200 else 1)
# Extra-trees classifier: more randomised than a random forest.
clf = ExtraTreesClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
    bootstrap=True,
)
scores = cross_val_score(estimator=clf, X=features, y=target, cv=10)
# Boxplot of the fold scores; the cell output is their mean.
boxplot(scores)
scores.mean()
Out[248]:
Conclusion: Yes (29/05/18)
In [249]:
# Number of rows in class 1 of the binary completion-time target.
sum(target)
Out[249]:
In [250]:
# Total number of rows in the target, to compare with the class-1 count above.
len(target)
Out[250]:
Conclusion: Yes but very unbalanced classes (29/05/18)
In [ ]: