In [1]:
print(pd.__version__)
import sklearn as skl
print(skl.__version__)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-39e5887af36f> in <module>()
----> 1 print(pd.__version__)
      2 import sklearn as skl
      3 print(skl.__version__)

NameError: name 'pd' is not defined

In [3]:
import sys
sys.version


Out[3]:
'3.6.0 (v3.6.0:41df79263a11, Dec 22 2016, 17:23:13) \n[GCC 4.2.1 (Apple Inc. build 5666) (dot 3)]'

stat chi-2


In [ ]:
#example of chi2
import scipy.stats as stats
observed = titanicTMP[['Embarked', 'AgeEmptyOrNot']].groupby(['Embarked'], as_index=False).apply(lambda x: pd.Series(dict(
    WithValue=(x['AgeEmptyOrNot'] == 1).sum(),
    EmptyValue=(x['AgeEmptyOrNot'] == 0).sum())))
stats.chi2_contingency(observed= observed)

In [ ]:
# TMP calling dict in param
def binClassif_model_kf(df, predictors, target, nbKF, model, paramDict):
...
        
        # Initialize our algorithm class
        if(model == "LinearRegression"):
            alg = LinearRegression()
        elif(model == "LogisticRegression"):
            alg = LogisticRegression()
        elif(model == "KNeighborsClassifier"):
            alg = KNeighborsClassifier(paramDict['n_neighbors'])
        elif(model == "RandomForestClassifier"):
            alg = RandomForestClassifier(paramDict['n_estimators'])
        # Training the algorithm using the predictors and target
        alg.fit(train_predictors, train_target)
        algs.append(alg)
        
....    # return the multiple algoriths and the accuracy
    return [algs, accuracy]

In [ ]:
#only model evaluation
accuracyList1 = []
alg = LogisticRegression(random_state=1)
for combination in predictorCombinations:
    scores = cross_val_score(alg, titanicTrainClean[combination], titanicTrainClean["Survived"], cv=3)
    accuracyList1.append(scores.mean())
    
#for index in range(len(predictorCombinations)):
#    print(combinations[index], accuracyList1[index])

for elementIndex in sort_list(accuracyList1):
    print(predictorCombinations[elementIndex], ": ", accuracyList1[elementIndex])

In [ ]:
#combine datasets
combine = [train_df, test_df]
for dataset in combine:
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

In [ ]:
import math
#plt.boxplot(titanicTrainClean["Fare"].apply(lambda x: math.log(x)))
plt.boxplot(titanicTrainClean["Fare"].apply(lambda x: x^(1/3)))

testing kf value


In [ ]:
# 
# - use predictorCombinations
# - use titanicTrainClean
def getAccuracy_byKF(algModel, nbKF_min, nbKF_max, title):
    nbKFList = []
    accuracyList = []
    
    for nbKF in range(nbKF_min, nbKF_max+1):
        model = binClassifModel_kf(titanicTrainClean[predictorsAll], titanicTrainClean["Survived"], algModel, nbKF)
        nbKFList.append(nbKF)
        accuracyList.append(model[1])
        
    #plt.plot(nbKFList, accuracyList)
    #f, ax = plt.subplots(1)
    #ax.plot(nbKFList, accuracyList)
    #ax.set_ylim(ymin=0.6)
    #plt.show(f)
    
    #plt.subplot(221)
    plt.plot(nbKFList, accuracyList)
    plt.xlabel('nbKF')
    plt.ylabel('accuracy')
    plt.title(title)
    plt.ylim(ymin=0.6)
    plt.grid(True)
    plt.show()

In [ ]:
nbKF_min = 3
nbKF_max = 12

algModel = LinearRegression()
getAccuracy_byKF(algModel, nbKF_min, nbKF_max, "LinearRegression")

algModel = LogisticRegression()
getAccuracy_byKF(algModel, nbKF_min, nbKF_max, "LogisticRegression")

algModel = GaussianNB()
getAccuracy_byKF(algModel, nbKF_min, nbKF_max, "GaussianNB")

algModel = KNeighborsClassifier()
getAccuracy_byKF(algModel, nbKF_min, nbKF_max, "KNeighborsClassifier")

algModel = DecisionTreeClassifier()
getAccuracy_byKF(algModel, nbKF_min, nbKF_max, "DecisionTreeClassifier")

algModel = RandomForestClassifier()
getAccuracy_byKF(algModel, nbKF_min, nbKF_max, "RandomForestClassifier")

correlation with empty data of age

other


In [ ]:
titanicDFs = [train_df, test_df]

for dataset in combine:
titanicTrainClean = pd.DataFrame({
        "Survived": titanicTrain["Survived"]
    })
titanicTrainClean = pd.DataFrame({
        "Survived": titanicTrain["Survived"]
    })

In [ ]:
titanicTrainClean['FareBand'] = pd.qcut(titanicTrainClean['Fare'], 4)
titanicTrainClean[['FareBand', 'Survived']].groupby(['FareBand'], as_index=False).mean().sort_values(by='FareBand', ascending=True)

In [ ]:
titanicTrainClean.loc[ titanicTrainClean['Fare'] <= 7.91, 'Fare2'] = 0
titanicTrainClean.loc[(titanicTrainClean['Fare'] > 7.91) & (titanicTrainClean['Fare'] <= 14.454), 'Fare2'] = 1
titanicTrainClean.loc[(titanicTrainClean['Fare'] > 14.454) & (titanicTrainClean['Fare'] <= 31), 'Fare2']   = 2
titanicTrainClean.loc[ titanicTrainClean['Fare'] > 31, 'Fare2'] = 3
titanicTrainClean['Fare2'] = titanicTrainClean['Fare2'].astype(int)

In [ ]:
sliceDuration = 10
TempDS = pd.DataFrame({
        "AgeDiscrete": round(titanicTrainDS["Age"]//sliceDuration)*sliceDuration,
        "Survived": titanicTrainDS["Survived"]
    })
vectTemp = titanicTemp["AgeDiscrete"]
titanicTemp["AgeDiscrete"] = vectTemp.map(str) + "-" + (vectTemp + sliceDuration).map(str)

titanicTemp[["AgeDiscrete", "Survived"]].groupby(["AgeDiscrete"], as_index=False).mean()

In [ ]:
# Helper that return create all combinations of predictors
myList = predictorsAll
predictorCombinations = [] # all combination of predictord
for index in range(1, len(myList)+1):
    for subset in itertools.combinations(myList, index):
         predictorCombinations.append(list(subset))  
            
#predictorCombinations

make the prediction


In [ ]:
algModel = RandomForestClassifier(n_estimators=100, min_samples_split=5, min_samples_leaf=2)
algModel = RandomForestClassifier(n_estimators=100)
algModel = RandomForestClassifier(n_estimators=100)
predictors = ['Pclass', 'Sex', 'Age', 'Parch', 'Fare']
predictors = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
result = binClassifModel_kf(titanicTrainDS[predictors], titanicTrainDS["Survived"], algModel, 5)
algList = result[0]

predictionsList = []
for alg in algList:
    predictions = alg.predict(titanicTestDS[predictors])
    predictionsList.append(predictions)

predictionsList2 = []
for alg in algList:
    predictions = alg.predict(titanicTrainDS[predictors])
    predictionsList2.append(predictions)
    

# There are different preditions, we take the mean (a voting-like system)
predictionsFinal = np.mean(predictionsList, axis=0)
predictionsFinal2 = np.mean(predictionsList2, axis=0)

# Map predictions to outcomes (the only possible outcomes are 1 and 0)
predictionsFinal[predictionsFinal > .5] = 1
predictionsFinal[predictionsFinal <=.5] = 0

In [ ]:
print(np.mean(predictionsList, axis=0).mean())
print(np.mean(predictionsList2, axis=0).mean())
print(titanicTrainDS["Survived"].mean())

In [ ]: