In [1]:
import pandas as pd
print(pd.__version__)
import sklearn as skl
print(skl.__version__)
In [3]:
import sys
sys.version
Out[3]:
In [ ]:
# Example: chi-squared test of independence between 'Embarked'
# and whether 'Age' is missing (AgeEmptyOrNot)
import scipy.stats as stats
observed = titanicTMP[['Embarked', 'AgeEmptyOrNot']].groupby('Embarked').apply(lambda x: pd.Series(dict(
    WithValue=(x['AgeEmptyOrNot'] == 1).sum(),
    EmptyValue=(x['AgeEmptyOrNot'] == 0).sum())))
stats.chi2_contingency(observed=observed)
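In [ ]:
# Unpacking the chi-squared result makes it easier to read (same test
# as the cell above; scipy's chi2_contingency returns the statistic,
# the p-value, the degrees of freedom, and the expected frequencies):
chi2, p, dof, expected = stats.chi2_contingency(observed=observed)
print("chi2 =", chi2, " p-value =", p, " dof =", dof)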
In [ ]:
# Select the model class and its hyperparameters from a dict argument
def binClassif_model_kf(df, predictors, target, nbKF, model, paramDict):
    ...
    # Initialize the algorithm class
    if model == "LinearRegression":
        alg = LinearRegression()
    elif model == "LogisticRegression":
        alg = LogisticRegression()
    elif model == "KNeighborsClassifier":
        alg = KNeighborsClassifier(n_neighbors=paramDict['n_neighbors'])
    elif model == "RandomForestClassifier":
        alg = RandomForestClassifier(n_estimators=paramDict['n_estimators'])
    # Train the algorithm on the predictors and target
    alg.fit(train_predictors, train_target)
    algs.append(alg)
    ...  # return the trained algorithms and the accuracy
    return [algs, accuracy]
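In [ ]:
# A minimal sketch of the binClassifModel_kf helper that later cells call
# (its real definition is not in this section; the names and fold logic
# below are assumptions): train one clone of the model per fold and
# return the fold models together with the mean held-out score.
import numpy as np
from sklearn.base import clone
from sklearn.model_selection import KFold

def binClassifModel_kf(X, y, algModel, nbKF):
    algs = []
    scores = []
    kf = KFold(n_splits=nbKF, shuffle=True, random_state=1)
    for trainIndex, testIndex in kf.split(X):
        alg = clone(algModel)  # fresh, unfitted copy for this fold
        alg.fit(X.iloc[trainIndex], y.iloc[trainIndex])
        algs.append(alg)
        # score() is accuracy for classifiers (R^2 for LinearRegression)
        scores.append(alg.score(X.iloc[testIndex], y.iloc[testIndex]))
    return [algs, np.mean(scores)]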
In [ ]:
# Model evaluation only: compare predictor combinations by cross-validated accuracy
accuracyList1 = []
alg = LogisticRegression(random_state=1)
for combination in predictorCombinations:
    scores = cross_val_score(alg, titanicTrainClean[combination], titanicTrainClean["Survived"], cv=3)
    accuracyList1.append(scores.mean())
# sort_list is a helper (defined elsewhere in the notebook) that returns
# the indices of its argument sorted by value
for elementIndex in sort_list(accuracyList1):
    print(predictorCombinations[elementIndex], ": ", accuracyList1[elementIndex])
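In [ ]:
# One possible definition of the sort_list helper used above (its real
# definition is outside this section, so this version is an assumption;
# it orders indices so the highest accuracy comes first):
def sort_list(values):
    return sorted(range(len(values)), key=lambda i: values[i], reverse=True)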
In [ ]:
# Apply the same feature engineering to both datasets
combine = [train_df, test_df]
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
In [ ]:
import math
# Cube-root transform to reduce the skew of 'Fare'
# (math.log fails on zero fares, hence the commented-out alternative)
#plt.boxplot(titanicTrainClean["Fare"].apply(lambda x: math.log(x)))
plt.boxplot(titanicTrainClean["Fare"].apply(lambda x: x**(1/3)))  # ** not ^: in Python, ^ is XOR
In [ ]:
# Plot cross-validated accuracy as a function of the number of folds.
# Relies on two globals: predictorsAll and titanicTrainClean.
def getAccuracy_byKF(algModel, nbKF_min, nbKF_max, title):
    nbKFList = []
    accuracyList = []
    for nbKF in range(nbKF_min, nbKF_max+1):
        model = binClassifModel_kf(titanicTrainClean[predictorsAll], titanicTrainClean["Survived"], algModel, nbKF)
        nbKFList.append(nbKF)
        accuracyList.append(model[1])
    plt.plot(nbKFList, accuracyList)
    plt.xlabel('nbKF')
    plt.ylabel('accuracy')
    plt.title(title)
    plt.ylim(bottom=0.6)  # 'ymin' is no longer accepted by matplotlib
    plt.grid(True)
    plt.show()
In [ ]:
nbKF_min = 3
nbKF_max = 12
algModel = LinearRegression()
getAccuracy_byKF(algModel, nbKF_min, nbKF_max, "LinearRegression")
algModel = LogisticRegression()
getAccuracy_byKF(algModel, nbKF_min, nbKF_max, "LogisticRegression")
algModel = GaussianNB()
getAccuracy_byKF(algModel, nbKF_min, nbKF_max, "GaussianNB")
algModel = KNeighborsClassifier()
getAccuracy_byKF(algModel, nbKF_min, nbKF_max, "KNeighborsClassifier")
algModel = DecisionTreeClassifier()
getAccuracy_byKF(algModel, nbKF_min, nbKF_max, "DecisionTreeClassifier")
algModel = RandomForestClassifier()
getAccuracy_byKF(algModel, nbKF_min, nbKF_max, "RandomForestClassifier")
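In [ ]:
# The same sweep written as a loop; this is an equivalent refactor of
# the cell above, not part of the original notebook:
models = [LinearRegression(), LogisticRegression(), GaussianNB(),
          KNeighborsClassifier(), DecisionTreeClassifier(),
          RandomForestClassifier()]
for algModel in models:
    getAccuracy_byKF(algModel, nbKF_min, nbKF_max, type(algModel).__name__)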
In [ ]:
titanicDFs = [train_df, test_df]
# Start the cleaned training set from the target column only
titanicTrainClean = pd.DataFrame({
    "Survived": titanicTrain["Survived"]
})
In [ ]:
# Split Fare into four quantile bands and inspect the survival rate per band
titanicTrainClean['FareBand'] = pd.qcut(titanicTrainClean['Fare'], 4)
titanicTrainClean[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)
In [ ]:
# Encode Fare as an ordinal feature; the band edges come from the FareBand quartiles above
titanicTrainClean.loc[ titanicTrainClean['Fare'] <= 7.91, 'Fare2'] = 0
titanicTrainClean.loc[(titanicTrainClean['Fare'] > 7.91) & (titanicTrainClean['Fare'] <= 14.454), 'Fare2'] = 1
titanicTrainClean.loc[(titanicTrainClean['Fare'] > 14.454) & (titanicTrainClean['Fare'] <= 31), 'Fare2'] = 2
titanicTrainClean.loc[ titanicTrainClean['Fare'] > 31, 'Fare2'] = 3
titanicTrainClean['Fare2'] = titanicTrainClean['Fare2'].astype(int)
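In [ ]:
# Equivalent binning via pd.cut with the same edges; shown as an
# alternative formulation, not from the original notebook:
import numpy as np
bins = [-np.inf, 7.91, 14.454, 31, np.inf]
titanicTrainClean['Fare2'] = pd.cut(titanicTrainClean['Fare'], bins=bins, labels=[0, 1, 2, 3]).astype(int)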
In [ ]:
sliceDuration = 10
# Discretize Age into 10-year slices and label each slice "lo-hi"
titanicTemp = pd.DataFrame({
    "AgeDiscrete": (titanicTrainDS["Age"] // sliceDuration) * sliceDuration,
    "Survived": titanicTrainDS["Survived"]
})
vectTemp = titanicTemp["AgeDiscrete"]
titanicTemp["AgeDiscrete"] = vectTemp.map(str) + "-" + (vectTemp + sliceDuration).map(str)
titanicTemp[["AgeDiscrete", "Survived"]].groupby(["AgeDiscrete"], as_index=False).mean()
In [ ]:
# Helper that builds every non-empty combination of the predictors
import itertools
myList = predictorsAll
predictorCombinations = []  # all combinations of predictors
for index in range(1, len(myList)+1):
    for subset in itertools.combinations(myList, index):
        predictorCombinations.append(list(subset))
#predictorCombinations
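In [ ]:
# Sanity check (added for illustration): n predictors yield
# 2**n - 1 non-empty combinations
print(len(predictorCombinations), 2**len(predictorsAll) - 1)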
In [ ]:
#algModel = RandomForestClassifier(n_estimators=100, min_samples_split=5, min_samples_leaf=2)
algModel = RandomForestClassifier(n_estimators=100)
#predictors = ['Pclass', 'Sex', 'Age', 'Parch', 'Fare']
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
result = binClassifModel_kf(titanicTrainDS[predictors], titanicTrainDS["Survived"], algModel, 5)
algList = result[0]
# Predictions on the test set, one array per fold model
predictionsList = []
for alg in algList:
    predictions = alg.predict(titanicTestDS[predictors])
    predictionsList.append(predictions)
# Predictions on the training set, kept for comparison
predictionsList2 = []
for alg in algList:
    predictions = alg.predict(titanicTrainDS[predictors])
    predictionsList2.append(predictions)
# The fold models give different predictions, so take the mean (a voting-like system)
predictionsFinal = np.mean(predictionsList, axis=0)
predictionsFinal2 = np.mean(predictionsList2, axis=0)
# Map the averaged votes to the only possible outcomes, 1 and 0
predictionsFinal[predictionsFinal > .5] = 1
predictionsFinal[predictionsFinal <= .5] = 0
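In [ ]:
# Equivalent one-liner for the vote-and-threshold step above
# (an alternative formulation, not from the original notebook):
predictionsFinal = (np.mean(predictionsList, axis=0) > 0.5).astype(int)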
In [ ]:
# Compare the mean predicted survival rate on the test and training sets
# with the actual survival rate in the training data
print(np.mean(predictionsList, axis=0).mean())
print(np.mean(predictionsList2, axis=0).mean())
print(titanicTrainDS["Survived"].mean())
In [ ]: