%run dataFormating.ipynb
import sklearn
print (sklearn.__version__)
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LassoCV, Lasso
from sklearn.linear_model import RidgeCV, Ridge
from ipywidgets import FloatProgress
from IPython.display import display
from math import *
from scipy import stats
from scipy.stats.mstats import normaltest
from matplotlib.pyplot import boxplot
Note: I am using only decision-tree-based methods here because other methods, such as naive Bayes, do not make sense on categorical data.
# Select columns that correspond to scientific questions (names starting with "Q")
scientificColumns = [x for x in list(defForms.columns.values) if x.startswith("Q")]

# Pick features and target
features = defForms.loc[:, scientificColumns]
target = defForms["temporality"].astype('int')

# NOTE(review): each `scores` below is overwritten by the next classifier
# without being reported in the visible code.

# Classify using decision trees -accounts for the small size of the dataset and the categorical nature of the features
# "sqrt" is what the former "auto" alias meant for classifiers; "auto" itself
# was deprecated and then removed in recent scikit-learn releases.
clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0, max_features="sqrt")
scores = cross_val_score(clf, features, target)

# Classify using random forests -accounts for the small size of the dataset and the categorical nature of the features, limit overfitting
clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
scores = cross_val_score(clf, features, target)

# Classify using extra tree classifiers, more random than random forest methods
clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
scores = cross_val_score(clf, features, target)
Conclusion: Accuracy is around 85%. Not bad but we expected better (17/01/2018)
# Select columns that correspond to scientific questions (names starting with "Q")
scientificColumns = [x for x in list(defCorrectedForms.columns.values) if x.startswith("Q")]

# Pick features and target
features = defCorrectedForms.loc[:, scientificColumns]
target = defCorrectedForms["temporality"].astype('int')

# NOTE(review): each `scores` below is overwritten by the next classifier
# without being reported in the visible code.

# Classify using decision trees -accounts for the small size of the dataset and the categorical nature of the features
# "sqrt" is what the former "auto" alias meant for classifiers; "auto" itself
# was deprecated and then removed in recent scikit-learn releases.
clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0, max_features="sqrt")
scores = cross_val_score(clf, features, target)

# Classify using random forests -accounts for the small size of the dataset and the categorical nature of the features, limit overfitting
clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
scores = cross_val_score(clf, features, target)

# Classify using extra tree classifiers, more random than random forest methods
clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
scores = cross_val_score(clf, features, target)
Conclusion: Accuracy is around 80%. Not bad but we expected better (19/12/2017)
def getPosttestUserIds(gfdf):
    """Return the distinct QUserId values of the selected rows of gfdf.

    A row is selected when its QTemporality value equals
    answerTemporalities[1].
    """
    isSelected = gfdf[QTemporality] == answerTemporalities[1]
    selectedRows = gfdf[isSelected]
    return selectedRows[QUserId].unique()
allDataWebgl1522 = prepareAllData(getAllUserVectorData(
_source = correctAnswers + demographicAnswers,
allDataWebgl1522Volunteers = prepareAllData(getAllUserVectorData(
_source = correctAnswers + demographicAnswers,
allDataWebgl160 = prepareAllData(getAllUserVectorData(
_source = correctAnswers + demographicAnswers,
allDataWebgl160Volunteers = prepareAllData(getAllUserVectorData(
_source = correctAnswers + demographicAnswers,
allDataPlaytestPhase2 = prepareAllData(getAllUserVectorData(
_source = correctAnswers + demographicAnswers,
def getAnonymousData(allDataClassif):
    """Return a copy of allDataClassif without its "anonymousID" column."""
    return allDataClassif.drop(columns=["anonymousID"])
# columns to exclude: contain direct information on posttest score
# Index.intersection replaces the `Index & list` operator form, whose
# set-operation meaning was deprecated and later removed from pandas.
dropPosttestColumns = allDataClassif.columns.intersection(deltaQuestions + posttestQuestions + ["scoreposttest", "scoredelta"])
# columns to exclude: pretest questions and the pretest score
dropPretestColumns = allDataClassif.columns.intersection(pretestQuestions + ["scorepretest"])
def getUnscaledFeatures(anonymousData, dropPosttest=True, dropPretest=True):
    """Return the rows of anonymousData that have a non-negative "scoreposttest".

    When dropPosttest is true the module-level dropPosttestColumns are
    removed; when dropPretest is true the module-level dropPretestColumns
    are removed as well.
    """
    hasPosttestScore = anonymousData["scoreposttest"] >= 0
    selected = anonymousData[hasPosttestScore]
    if dropPosttest:
        selected = selected.drop(columns=dropPosttestColumns)
    if dropPretest:
        selected = selected.drop(columns=dropPretestColumns)
    return selected
def getFeaturesTarget(allDataClassif, chosenModel = Lasso):
    """Scale the features, cross-validate chosenModel and fit it on all rows.

    Parameters:
        allDataClassif: table containing "anonymousID", "scoreposttest" and
            the columns expected by getUnscaledFeatures.
        chosenModel: estimator class instantiated with no arguments.

    Returns:
        (scores, standardScaler, model, features, target, unscaledFeatures)
        where scores are the 10-fold cross-validation scores, standardScaler
        and model are both fitted on the full data, and features is the
        scaled version of unscaledFeatures.
    """
    # Remove id
    anonymousData = getAnonymousData(allDataClassif)

    # Get features and target
    # Only select rows where scoreposttest is not negative
    unscaledFeatures = getUnscaledFeatures(anonymousData)
    target = anonymousData[anonymousData["scoreposttest"] >= 0]["scoreposttest"]

    # Center and scale data; the scaler has to be fitted before it can
    # transform, and is returned fitted so callers can reuse it.
    standardScaler = preprocessing.StandardScaler()
    features = standardScaler.fit_transform(unscaledFeatures)

    # Run regression with cross-validation
    model = chosenModel()
    scores = cross_val_score(model, features, target, cv=10)
    # NOTE(review): the label says "Accuracy", but the value is whatever the
    # estimator's default scorer returns.
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

    # cross_val_score works on clones of `model`; fit the returned instance
    # itself so that predict() and coef_ are usable by callers.
    model.fit(features, target)

    return scores, standardScaler, model, features, target, unscaledFeatures
# Run the feature/target pipeline on allDataClassif with the default model (Lasso).
scores, standardScaler, model, features, target, unscaledFeatures = getFeaturesTarget(allDataClassif)
def getInvertedCriteria(allData, criteria):
    """Return a copy of allData where each criterion column c becomes 1 / (1 + c).

    Parameters:
        allData: input table; it is not modified.
        criteria: list of column names to invert.

    Criteria absent from allData are reported with a printed warning and
    skipped instead of raising KeyError.
    """
    result = allData.copy()
    # Plain membership tests avoid the `Index & list` operator, whose
    # set-operation meaning was removed from pandas.
    presentCriteria = [criterion for criterion in criteria if criterion in result.columns]
    if len(presentCriteria) != len(criteria):
        print("not all criteria are in input columns")
    for criterion in presentCriteria:
        result[criterion] = 1 / (1 + result[criterion])
    return result
# Same pipeline on a copy of allData whose total-time and completion-time
# criteria are replaced by 1 / (1 + value).
allDataClassifInv = getAllDataClassif(getInvertedCriteria(allData, totalTimesCriteria + completionTimesCriteria))
scoresInv, standardScalerInv, modelInv, featuresInv, targetInv, unscaledFeaturesInv = getFeaturesTarget(allDataClassifInv)
#list(set(allDataClassifInv.columns) - set(['anonymousID']))
#criteria = list(\
# set(adc.columns)\
# - set(adc.columns & \
# (deltaQuestions + posttestQuestions
# + pretestQuestions
# + ["scoreposttest", "scoredelta", 'scoreundefined', "anonymousID"]
# + ["scorepretest"]
# + predefinedCriteria
# ))\
getScoresMean(allDataClassifInv, list(set(allDataClassifInv.columns)
- set(
+ posttestQuestions
+ pretestQuestions
+ ["scorepretest", "scoreposttest", "scoredelta", 'scoreundefined', 'anonymousID']
def getPrediction(standardScaler, model, unscaledX):
    """Scale the single sample unscaledX and return the model's prediction for it."""
    singleRow = [unscaledX]
    scaledRow = standardScaler.transform(singleRow)
    predictions = model.predict(scaledRow)
    return predictions[0]
def getPredictionVsActual(standardScaler, model, allDataClassif):
    """Compare the model's predictions with the actual posttest scores.

    Parameters:
        standardScaler: fitted scaler exposing transform().
        model: fitted estimator exposing predict() and score().
        allDataClassif: table containing "anonymousID", "scoreposttest" and
            the feature columns; its index is assumed to be unique.

    Returns:
        (result, r2Coef) where result is indexed like the kept rows and has
        the columns "predicted", "actual" and "error" (predicted - actual),
        and r2Coef is model.score() on the scaled features.
    """
    unscaledFeatures = getUnscaledFeatures(getAnonymousData(allDataClassif))
    # Scale once and predict every row in a single call instead of issuing
    # one transform/predict pair per user.
    scaledFeatures = standardScaler.transform(unscaledFeatures)

    # Float columns from the start: predictions are floats, so an
    # int-initialised frame would have to be upcast on assignment.
    result = pd.DataFrame(index = unscaledFeatures.index, columns = ["predicted", "actual", "error"], dtype = float)
    result["predicted"] = model.predict(scaledFeatures)
    result["actual"] = allDataClassif.loc[unscaledFeatures.index, "scoreposttest"].values
    result["error"] = result["predicted"] - result["actual"]

    r2Coef = model.score(scaledFeatures, result["actual"].values)
    return result, r2Coef
samples = [allDataWebgl1522,
# For every sample, print the number of rows and the score of the plain model,
# then the same for the model trained on inverted time criteria.
for sample in samples:
    # Plain criteria
    _allDataClassif = getAllDataClassif(sample)
    result, r2Coef = getPredictionVsActual(standardScaler, model, _allDataClassif)
    print("{0:0=2d}".format(len(_allDataClassif)) + ": " + str(r2Coef))

    # Inverted total-time and completion-time criteria
    _allDataClassifInv = getAllDataClassif(
        getInvertedCriteria(sample, totalTimesCriteria + completionTimesCriteria)
    )
    resultInv, r2CoefInv = getPredictionVsActual(standardScalerInv, modelInv, _allDataClassifInv)
    print("{0:0=2d}".format(len(_allDataClassifInv)) + " inv: " + str(r2CoefInv))
def getLassoModelCoefficients(model, unscaledFeatures, useAbs = True):
    """Return the non-zero coefficients of a fitted model as a Series.

    Parameters:
        model: fitted estimator exposing a one-dimensional coef_ array.
        unscaledFeatures: table whose columns line up with model.coef_.
        useAbs: when true, report absolute values of the coefficients.

    Returns:
        pandas Series indexed by the names of the columns whose coefficient
        is non-zero.
    """
    # flatnonzero yields a plain 1-D position array, usable for both the
    # coefficient array and the column index.
    nonNullIndices = np.flatnonzero(model.coef_)
    data = model.coef_[nonNullIndices]
    if useAbs:
        data = np.abs(data)
    lassoModelParameters = pd.Series(
        index = unscaledFeatures.columns[nonNullIndices],
        data = data,
    )
    return lassoModelParameters
# Non-zero coefficients of the plain model
getLassoModelCoefficients(model, unscaledFeatures)
# Non-zero coefficients of the inverted-time model, labelled with the columns
# of the feature table that model was actually built from
getLassoModelCoefficients(modelInv, unscaledFeaturesInv)
#unscaledFeatures = getUnscaledFeatures(getAnonymousData(allDataClassifWebgl160Volunteers))
#unscaledX = unscaledFeatures.iloc[0].values
#X = standardScaler.transform([unscaledX])
#X = (unscaledX - standardScaler.mean_) / standardScaler.scale_
#model.predict([X])[0], X) + model.intercept_
# Disabled cell (guarded by `if False`): prepares an actual-vs-predicted plot
# over users sorted by posttest score.
if False:
    anonymousData = getAnonymousData(allDataClassif)
    sortedUnscaledFeatures = (
        anonymousData[anonymousData["scoreposttest"] >= 0]
        .sort_values(by="scoreposttest")
        .drop(dropPosttestColumns, axis = 1)
    )
    sortedTarget = sorted(anonymousData[anonymousData["scoreposttest"] >= 0]["scoreposttest"])

    # Center and scale data variant
    sortedFeatures = standardScaler.transform(sortedUnscaledFeatures)

    x = range(len(sortedFeatures))
    alpha = 0.5
    fig, ax = plt.subplots()
    plt.title('Actual vs predicted score')
    plt.xlabel('User index')
    #plt.plot(x, model.predict(sortedFeatures), kind = 'bar')
    #plt.plot(x, sortedTarget), model.predict(sortedFeatures), alpha=alpha, label='predicted', linewidth=0), sortedTarget, alpha=alpha, label='actual')
Conclusion: Score cannot be predicted by the table of RedMetrics data (19/07/2018).
def getFeaturesTargetSecondDegreePolynomial(allDataClassif, chosenModel = Lasso):
    """Like getFeaturesTarget, with a degree-2 polynomial feature expansion.

    Parameters:
        allDataClassif: table containing "anonymousID", "scoreposttest" and
            the columns expected by getUnscaledFeatures.
        chosenModel: estimator class instantiated with no arguments.

    Returns:
        (scores, standardScaler, model, features, target, unscaledFeatures)
        where unscaledFeatures is the polynomial expansion (an array, not the
        original table), features is its scaled version, and standardScaler
        and model are both fitted on the full data.
    """
    # Remove id
    anonymousData = getAnonymousData(allDataClassif)

    # Get features and target
    # Only select rows where scoreposttest is not negative
    unscaledFeatures = getUnscaledFeatures(anonymousData)
    target = anonymousData[anonymousData["scoreposttest"] >= 0]["scoreposttest"]

    # Add polynomial features
    secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
    unscaledFeatures = secondDegreeFeatures.fit_transform(unscaledFeatures)

    # Center and scale data; the scaler has to be fitted before it can
    # transform, and is returned fitted so callers can reuse it.
    standardScaler = preprocessing.StandardScaler()
    features = standardScaler.fit_transform(unscaledFeatures)

    # Run regression with cross-validation
    model = chosenModel()
    scores = cross_val_score(model, features, target, cv=10)
    # NOTE(review): the label says "Accuracy", but the value is whatever the
    # estimator's default scorer returns.
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

    # cross_val_score works on clones of `model`; fit the returned instance
    # itself so that predict() and coef_ are usable by callers.
    model.fit(features, target)

    return scores, standardScaler, model, features, target, unscaledFeatures
# Second-degree polynomial pipeline on the plain table (suffix 1) and on the
# inverted-time table (suffix 2).
scores1, standardScaler1, model1, features1, target1, unscaledFeatures1 = getFeaturesTargetSecondDegreePolynomial(allDataClassif);
scores2, standardScaler2, model2, features2, target2, unscaledFeatures2 = getFeaturesTargetSecondDegreePolynomial(allDataClassifInv);
Conclusion: Score cannot be predicted by the table of RedMetrics data + second degree polynomial (30/01/2018)
Let's try reducing the number of features.
# Remove id
anonymousData = getAnonymousData(allDataClassifInv)

# Get features and target
# Only select rows where scoreposttest is not negative
unscaledFeatures = anonymousData[anonymousData["scoreposttest"] >= 0]

# Alternative feature subsets that were tried:
#unscaledFeatures = unscaledFeatures[["craft", "death", "add", "remove", "reach", "maxChapter"] + totalTimesCriteria + completionTimesCriteria]
#unscaledFeatures = unscaledFeatures[["craft", "death", "add", "remove", "reach", "maxChapter"]]
#unscaledFeatures = unscaledFeatures[totalTimesCriteria]
#unscaledFeatures = unscaledFeatures[completionTimesCriteria]
#unscaledFeatures = unscaledFeatures[["maxChapter", "ch05completion", "ch07completion", "ch07total", "ch09total"]]
#unscaledFeatures = unscaledFeatures[['pretest Enjoyed playing', 'scorepretest', 'pretest Want to learn more about Biology', 'ch05total', 'ch07total']]
#unscaledFeatures = unscaledFeatures[['ch05completion', 'ch08total', 'ch06total', 'scorepretest', 'pretest Want to learn more about Biology', 'ch05total', 'ch07total']]

if False:#'columnsForRegression' in globals():
    unscaledFeatures = unscaledFeatures[columnsForRegression]
else:
    # unscaledFeatures = unscaledFeatures[['ch05completion', 'ch08total', 'ch06total', 'scorepretest', 'pretest Want to learn more about Biology', 'ch05total', 'ch07total']]
    # unscaledFeatures = unscaledFeatures[['pretest Enjoyed playing', 'scorepretest', 'pretest Want to learn more about Biology', 'ch05total', 'ch07total']]
    unscaledFeatures = unscaledFeatures[['ch02completion', 'ch05completion', 'ch05total', 'ch07total', 'ch08total']]

target = anonymousData[anonymousData["scoreposttest"] >= 0]["scoreposttest"]

# Center and scale data.
# The selected columns are scaled directly: the second-degree expansion that
# used to be computed here was overwritten before use, and the coefficient
# lookup below needs exactly one coefficient per column of unscaledFeatures.
features = preprocessing.scale(unscaledFeatures)

# Run Lasso regression with cross-validation
model = Lasso()
scores = cross_val_score(model, features, target, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()))

# cross_val_score fits clones only; fit `model` itself before reading coef_.
model.fit(features, target)
getLassoModelCoefficients(model, unscaledFeatures)
def getScoresMean(allDataClassif, columnsSubset, usePolynomialFeatures = False):
    """Return the mean 10-fold cross-validation score of a Lasso model.

    Parameters:
        allDataClassif: table containing "anonymousID" and "scoreposttest".
        columnsSubset: list of column names used as features.
        usePolynomialFeatures: when true, expand the features to degree 2
            before scaling. The default (False) matches the previous
            behaviour, where the expansion was computed and then discarded.

    Only rows with a non-negative "scoreposttest" are used; that column is
    the regression target.
    """
    anonymousData = getAnonymousData(allDataClassif)
    hasPosttestScore = anonymousData["scoreposttest"] >= 0
    unscaledFeatures = anonymousData.loc[hasPosttestScore, columnsSubset]
    target = anonymousData.loc[hasPosttestScore, "scoreposttest"]

    # The expansion is skipped by default: this function is called once per
    # column combination, so unused work here is paid on every iteration.
    if usePolynomialFeatures:
        secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
        unscaledFeatures = secondDegreeFeatures.fit_transform(unscaledFeatures)

    features = preprocessing.scale(unscaledFeatures)
    model = Lasso()
    scores = cross_val_score(model, features, target, cv=10)
    return scores.mean()
# number of possible subsets of size n (n = 3 to 8) of a set of size 96
import scipy.special
scipy.special.binom(96, 3),\
scipy.special.binom(96, 4),\
scipy.special.binom(96, 5),\
scipy.special.binom(96, 6),\
scipy.special.binom(96, 7),\
scipy.special.binom(96, 8)
from IPython.display import HTML
function code_toggle_err() {
if (code_show_err){
} else {
code_show_err = !code_show_err
$( document ).ready(code_toggle_err);
To toggle on/off output_stderr, click <a href="javascript:code_toggle_err()">here</a>.''')
def getETA(computations, timestamp, computationSpeed = 2794155 / 42338):
    """Return the estimated completion time of a batch of computations.

    Parameters:
        computations: number of computations left to run.
        timestamp: start time; anything a pandas Timedelta can be added to.
        computationSpeed: computations per second. The default is the
            previously hard-coded ratio 2794155 / 42338.

    Raises:
        ValueError: if computationSpeed is not strictly positive.
    """
    if computationSpeed <= 0:
        raise ValueError("computationSpeed must be strictly positive")
    duration = computations / computationSpeed
    eta = timestamp + pd.Timedelta(seconds = duration)
    return eta
import itertools
import time
import scipy.special
import warnings
from ipywidgets import Textarea, FloatText, ToggleButton, Checkbox
# Work on a copy of the inverted-time table (the plain table is the commented alternative).
#adc = allDataClassif.copy()
adc = allDataClassifInv.copy()
# Columns that are always part of the tested feature set; alternatives are kept below.
# criteria with pretest info
#predefinedCriteria = ['ch05completion', 'scorepretest', 'pretest Want to learn more about Biology', 'ch07total', 'ch05total',]
# criteria with only RM info
#predefinedCriteria = ['ch02completion', 'ch05completion', 'ch05total', 'ch07total', 'ch08total',]
#predefinedCriteria = ['ch02completion', 'ch05completion', 'ch05total', 'ch07total', 'ch08total',]
predefinedCriteria = ['ch06completion', 'ch02completion', 'ch07total', 'ch05total', ]
criteria = list(\
- set(adc.columns & \
(deltaQuestions + posttestQuestions
+ pretestQuestions
+ ["scoreposttest", "scoredelta", 'scoreundefined', "anonymousID"]
+ ["scorepretest"]
+ predefinedCriteria
# Size of the column subsets to enumerate, and how many such subsets exist.
# scipy.special.binom returns a float.
subsetSize = 4
combinations = scipy.special.binom(len(criteria), subsetSize)
print("ETA " + str(getETA(combinations,
if True:
# very long computation time: > 10h
maxScore = 0.36
i = 0
columnsForRegression = []
iterations = combinations+2
_progress = IntProgress(min=0, max=iterations)
_intText = IntText(0)
_currentBest = FloatText(0.0)
_currentCriteria = Textarea("")
#_stopButton = ToggleButton(value=False, description='Stop')
#_stopCheckbox = Checkbox(value=False, description='Stop')
iterator = itertools.combinations(criteria, subsetSize)
start_time = time.time()
for columnsSubset in iterator:
#if _stopButton.value or _stopCheckbox.value or (i >= iterations):
if (i >= iterations):
i += 1
_progress.value += 1
_intText.value+= 1
score = getScoresMean(adc, list(columnsSubset) + predefinedCriteria)
if score > maxScore:
maxScore = score
_currentBest.value = score
columnsForRegression = list(columnsSubset) + predefinedCriteria
_currentCriteria.value = str(columnsForRegression)
print("--- executed %s / %s in %s seconds ---" % (i, combinations, time.time() - start_time))
print("--- end time: " + str(
maxScore, columnsForRegression
# how long to compute all
# NOTE(review): the division by 3600 suggests these are hours derived from
# measured seconds per batch of combinations -- confirm the units.
(17 * 61124064 / 1000) / 3600,\
(249 * 57940519 / 15000) / 3600,\
(204 * 57940519 / 15000) / 3600,\
# how much computed in some duration
durationSeconds = 5 * 60
durationSeconds * 1000 / 17
Conclusion: Tried different combinations, but cannot find any interesting regression (02/02/2018)
# Drop the identifier column
anonymousData = gameAndCorrectedAfterDataClassif.drop(columns=["anonymousID"])

# Features: columns "sessionsCount" through "completionTime";
# target: "biologyStudy". Both restricted to rows with a non-negative
# "scoreposttest".
features = anonymousData.loc[anonymousData["scoreposttest"] >= 0, "sessionsCount":"completionTime"]
target = anonymousData.loc[anonymousData["scoreposttest"] >= 0, "biologyStudy"]

# Degree-2 polynomial expansion, then centering and scaling
secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
features = preprocessing.scale(secondDegreeFeatures.fit_transform(features))

# 10-fold cross-validated Lasso regression
model = Lasso()
scores = cross_val_score(model, features, target, cv=10)
Conclusion: No (30/01/2018)
# Remove id
anonymousData = gameAndCorrectedAfterDataClassif.drop("anonymousID", axis = 1)

# Get features and target (all rows are used here; no posttest-score filter)
features = anonymousData.loc[:,"sessionsCount":"completionTime"]
# Element-wise sum of the two columns. The builtin sum(a, b) treats b as the
# start value, so it returned gameFrequency shifted by the total of
# gameInterest rather than a per-row sum.
target = anonymousData["gameInterest"] + anonymousData["gameFrequency"]

# Add polynomial features
secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
features = secondDegreeFeatures.fit_transform(features)

# Center and scale data
features = preprocessing.scale(features)
In [222]:
# Run Lasso regression with 10-fold cross-validation on the current
# module-level features and target
model = Lasso()
scores = cross_val_score(model, features, target, cv=10)
Conclusion: No (30/01/2018)
# Given a question tag, plot scores of cross-validated model
def tryClassification(data, scientificQuestion):
    """Cross-validate an extra-trees classifier predicting one question column.

    Parameters:
        data: table containing "anonymousID", "scoreposttest", the
            module-level `criteria` columns and the scientificQuestion column.
        scientificQuestion: name of the column used as the label.

    Returns:
        [mean, standard deviation] of the 5-fold cross-validation scores.
    """
    # Drop the identifier, then keep rows with a non-negative posttest score
    scoredRows = data.drop(columns=["anonymousID"])
    scoredRows = scoredRows[scoredRows["scoreposttest"] >= 0]

    # Inputs are the `criteria` columns; the label is the question cast to int
    inputs = scoredRows.loc[:, criteria]
    labels = scoredRows.loc[:, scientificQuestion].astype('int')

    # Degree-2 polynomial expansion followed by centering and scaling
    expander = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
    inputs = preprocessing.scale(expander.fit_transform(inputs))

    # Extra-trees classifier, more random than random forest methods
    classifier = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
    foldScores = cross_val_score(classifier, inputs, labels, cv=5)

    # A figure/axes pair is created here; nothing is drawn on it in this body
    fig, ax = plt.subplots()

    return [foldScores.mean(), foldScores.std()]
# Index labels of the correctAnswers entries whose length is non-zero
scientificQuestionsDescrs = correctAnswers[correctAnswers.apply(len) != 0].index.values.tolist()

# Check that every criterion is a column of anonymousData.
# `set(a) in set(b)` is a membership test for the whole set as one element,
# which is always False here; issubset is the intended check.
set(criteria).issubset(anonymousData.columns)#[24:37]
#[c for c in criteria if c not in anonymousData.columns]
#[c for c in anonymousData.columns if c not in criteria]

# One column per question, holding the two values returned by tryClassification.
# NOTE(review): the second row is labelled "Var" but receives scores.std().
allScores = pd.DataFrame(index = ["Mean", "Var"])
# ["QGenotypePhenotype", "QBioBricksDevicesComposition", "QAmpicillin", "QBBNamePlasmid", "QBBFunctionTER", "QBBNamePromoter", "QBBFunctionGameCDS", "QBBNameTerminator", "QBBFunctionBiologyCDS", "QBBNameRBS", "QBBExampleCDS", "QBBNameCDS", "QBBFunctionPR", "QBBFunctionRBS", "QBBFunctionPlasmid", "QBBNameOperator", "QDeviceRbsPconsFlhdcTer", "QDevicePconsRbsFlhdcTer", "QDevicePbadRbsGfpTer", "QDevicePbadGfpRbsTer", "QDeviceGfpRbsPconsTer", "QDevicePconsGfpRbsTer", "QDeviceAmprRbsPconsTer", "QDeviceRbsPconsAmprTer", "QGreenFluorescence", "QUnequipDevice", "QDevicePbadRbsAraTer"]
for question in scientificQuestions:
    questionTag = question
    scores = tryClassification(gameAndCorrectedAfterDataClassif, questionTag)
    allScores[questionTag] = scores
# Replace the question tags by their descriptions as column labels
allScores.columns = scientificQuestionsDescrs
Conclusion: RedMetrics data can be used to predict answers to certain scientific questions (29/05/2018). TODO Raphael: check which questions you want additional analysis for.
In [256]:
#from scipy import stats
def getBoxplot(scores, title = ''):
    """Create a new figure holding a single subplot.

    NOTE(review): neither scores nor title is used in the visible body, and
    nothing is returned -- the drawing code may be missing from this view.
    """
    # figure related code
    figure = plt.figure()
    axes = figure.add_subplot(111)
In [231]:
# Column names used as model features by getPerformanceFromQuestionGroup.
# NOTE(review): the list contains 'scoreposttest', 'scoreundefined' and
# 'scoredelta'; if the predicted questions contribute to those scores, the
# features leak the target -- confirm before trusting the resulting scores.
#pd.concat([anonymousData.loc[:,"sessionsCount":"completionTime"], anonymousData.loc[:,"gameInterest":"previousPlay"]], axis=1).columns
ingameCriteria = ['sessionsCount', 'scoreposttest', 'scoreundefined', 'complete',
'configure', 'craft', 'death', 'equip', 'unequip', 'add', 'remove',
'gotourl', 'pickup', 'reach', 'restart', 'selectmenu', 'start',
'scoredelta', 'maxChapter', 'efficiency', 'thoroughness', 'fun',
'completionTime', 'ch00completion', 'ch01completion',
'ch02completion', 'ch03completion', 'ch04completion',
'ch05completion', 'ch06completion', 'ch07completion',
'ch08completion', 'ch09completion', 'ch10completion',
'ch11completion', 'ch12completion', 'ch13completion',
'ch14completion', 'ch00total', 'ch01total', 'ch02total',
'ch03total', 'ch04total', 'ch05total', 'ch06total', 'ch07total',
'ch08total', 'ch09total', 'ch10total', 'ch11total', 'ch12total',
'ch13total', 'ch14total', 'totalTime']
# boxplot function
# questions: array of strings of question names
def getPerformanceFromQuestionGroup(questions,
                                    thresholdPercentage = 1.0,
                                    extraTreesClassifier = False,
                                    randomForestClassifier = False,
                                    lasso = False,
                                    histTarget = 0
                                   ):
    """Cross-validate models predicting the answers to a group of questions.

    Parameters:
        questions: list of question column names.
        thresholdPercentage: share of the questions that must be answered
            correctly for a row to be put in class 1 (class 0 otherwise).
        extraTreesClassifier: run an extra-trees classifier on the classes.
        randomForestClassifier: run a random forest classifier on the classes.
        lasso: run a Lasso regression on the number of correct answers.
        histTarget: when positive, plot a histogram of the number of correct
            answers with bins range(histTarget).

    Features are the module-level ingameCriteria columns of
    gameAndCorrectedAfterDataClassif, expanded to degree 2 and scaled.
    Results are printed and plotted; nothing is returned.
    """
    # Remove id
    anonymousData = gameAndCorrectedAfterDataClassif.drop("anonymousID", axis = 1)

    # Get features and target
    #features = pd.concat([anonymousData.loc[:,"sessionsCount":"completionTime"], anonymousData.loc[:,"gameInterest":"previousPlay"]], axis=1)
    features = anonymousData.loc[:,ingameCriteria]
    # Number of correct answers per row, and its thresholded 0/1 version
    digitalTarget = anonymousData.loc[:, questions].astype(int).sum(axis=1)
    categoricalTarget = digitalTarget.apply(lambda x: 0 if x < thresholdPercentage*len(questions) else 1)

    # Add polynomial features
    secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
    features = secondDegreeFeatures.fit_transform(features)

    # Center and scale data
    features = preprocessing.scale(features)

    if extraTreesClassifier:
        # Classify using extra tree classifiers, more random than random forest methods
        clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
        scores = cross_val_score(clf, features, categoricalTarget, cv=10)
        print("ExtraTreesClassifier scores mean: " + str(scores.mean()))
        # Display plot
        getBoxplot(scores, "ExtraTreesClassifier boxplot")

    if randomForestClassifier:
        # Classify using random forests -accounts for the small size of the dataset and the categorical nature of the features, limit overfitting
        clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
        # NOTE(review): no cv argument here, unlike the two other branches (cv=10)
        scores = cross_val_score(clf, features, categoricalTarget)
        print("RandomForestClassifier scores mean: " + str(scores.mean()))
        # Display plot
        getBoxplot(scores, "RandomForestClassifier boxplot")

    if lasso:
        # Run Lasso regression with cross-validation
        model = Lasso()
        scores = cross_val_score(model, features, digitalTarget, cv=10)
        print("Lasso scores mean: " + str(scores.mean()))
        # Display plot
        getBoxplot(scores, "Lasso boxplot")

    if histTarget > 0:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # Histogram of this question group's own summed target; the previous
        # code read a module-level `target` left over from another cell.
        ax.hist(digitalTarget, bins = range(histTarget))
# Question group "hard": class 1 means at least half of the questions are correct.
hardQuestions = ["QBBFunctionPR", "QBBNameOperator", "QDevicePbadRbsAraTer"]
getPerformanceFromQuestionGroup(hardQuestions, thresholdPercentage = 0.5, extraTreesClassifier = True, randomForestClassifier = True, lasso = True)
Conclusion: Very high quality prediction (29/05/18)
In [236]:
# Question group "BioBrick symbol recognition": class 1 means at least 60% of the questions are correct.
bbSymbolRecognition = ["QBBNamePlasmid", "QBBFunctionTER", "QBBNamePromoter", "QBBFunctionGameCDS", "QBBNameTerminator", "QBBFunctionBiologyCDS", "QBBNameRBS", "QBBExampleCDS", "QBBNameCDS", "QBBFunctionPR", "QBBFunctionRBS", "QBBFunctionPlasmid", "QBBNameOperator"]
getPerformanceFromQuestionGroup(bbSymbolRecognition, thresholdPercentage = 0.6, extraTreesClassifier = True, randomForestClassifier = True, lasso = True)
Conclusion: No apparent possible prediction (1/02/2018)
In [237]:
# Question group "easy": class 1 means all questions are correct; also plots the target histogram.
easyQuestions = ["QBioBricksDevicesComposition", "QDeviceRbsPconsFlhdcTer", "QGreenFluorescence"]
getPerformanceFromQuestionGroup(easyQuestions, thresholdPercentage = 1.0, extraTreesClassifier = True, randomForestClassifier = True, lasso = True, histTarget = 14)
Conclusion: Inconclusive (01/02/2018)
In [238]:
knowledgeQuestions = ["QAmpicillin",
getPerformanceFromQuestionGroup(knowledgeQuestions, thresholdPercentage = 0.7, extraTreesClassifier = True, randomForestClassifier = True, lasso = True, histTarget = 14)
comprehensionQuestions = ["QBioBricksDevicesComposition",
getPerformanceFromQuestionGroup(comprehensionQuestions, thresholdPercentage = 1.0, extraTreesClassifier = True, randomForestClassifier = True, lasso = True, histTarget = 14)
applicationQuestions = ["QGenotypePhenotype",
getPerformanceFromQuestionGroup(applicationQuestions, thresholdPercentage = 1.0, extraTreesClassifier = True, randomForestClassifier = True, lasso = True, histTarget = 14)
analysisQuestions = ["QBBFunctionGameCDS",
getPerformanceFromQuestionGroup(analysisQuestions, thresholdPercentage = 0.7, extraTreesClassifier = True, randomForestClassifier = True, lasso = True, histTarget = 14)
synthesisQuestions = ["QDeviceRbsPconsFlhdcTer",
getPerformanceFromQuestionGroup(synthesisQuestions, thresholdPercentage = 1.0, extraTreesClassifier = True, randomForestClassifier = True, lasso = True, histTarget = 14)
# Drop the identifier column
anonymousData = gameAndCorrectedBeforeDataClassif.drop(columns=["anonymousID"])

# Features span the columns from "gameInterest" to the last gender column
# present: 'gender_Male' unless one of the later candidates exists.
lastColumn = 'gender_Male'
for potentialLastColumn in ['gender_Other', 'gender_Prefer not to say']:
    if potentialLastColumn in anonymousData.columns:
        lastColumn = potentialLastColumn
features = anonymousData.loc[:, "gameInterest":lastColumn]
# Target: the "completionTime" column
target = anonymousData["completionTime"]

# Degree-2 polynomial expansion, then centering and scaling
secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
features = preprocessing.scale(secondDegreeFeatures.fit_transform(features))

# 10-fold cross-validated Lasso regression
model = Lasso(max_iter=10000, alpha=10)
scores = cross_val_score(model, features, target, cv=10)

# Classification variant: 0 below the 7200 threshold, 1 otherwise
target = target.apply(lambda completionTime: 0 if completionTime < 7200 else 1)

# Extra-trees classifier, more random than random forest methods
clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
scores = cross_val_score(clf, features, target, cv=10)

# Display plot
Conclusion: No (01/02/2018)
# Drop the identifier column
anonymousData = gameAndCorrectedAfterDataClassif.drop(columns=["anonymousID"])

# Features span the columns from "gameInterest" to the last gender column
# present: 'gender_Male' unless one of the later candidates exists.
lastColumn = 'gender_Male'
for potentialLastColumn in ['gender_Other', 'gender_Prefer not to say']:
    if potentialLastColumn in anonymousData.columns:
        lastColumn = potentialLastColumn
features = anonymousData.loc[:, "gameInterest":lastColumn]
# Target: the "completionTime" column
target = anonymousData["completionTime"]

# Degree-2 polynomial expansion, then centering and scaling
secondDegreeFeatures = preprocessing.PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
features = preprocessing.scale(secondDegreeFeatures.fit_transform(features))

# 10-fold cross-validated Lasso regression
model = Lasso(max_iter=1000000)
scores = cross_val_score(model, features, target, cv=10)

# Classification variant: 0 below the 7200 threshold, 1 otherwise
target = target.apply(lambda completionTime: 0 if completionTime < 7200 else 1)

# Extra-trees classifier, more random than random forest methods
clf = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0, bootstrap=True)
scores = cross_val_score(clf, features, target, cv=10)

# Display plot
Conclusion: Yes (29/05/18)
Conclusion: Yes but very unbalanced classes (29/05/18)
