In [33]:
    
import pandas as pd
from autoc import DataExploration, PreProcessor, NaImputer
from autoc.utils.getdata import get_dataset
import numpy as np
# scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import roc_curve, accuracy_score, auc, classification_report
# matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
    
Two approaches:
In [34]:
    
titanic = get_dataset("titanic")
    
In [35]:
    
titanic.who.dtype.kind == 'O'
    
    Out[35]:
In [36]:
    
titanic.head()
    
    Out[36]:
In [37]:
    
exploration_titanic = DataExploration(titanic)
    
In [38]:
    
exploration_titanic.print_infos() # there are duplicate rows here because there is no id column, interesting!
    
    
In [39]:
    
exploration_titanic.nacolcount()
    
    Out[39]:
In [40]:
    
exploration_titanic.structure()
    
    Out[40]:
In [41]:
    
titanic.corr()
    
    Out[41]:
In [42]:
    
titanic.loc[titanic.age.isnull(),:].head(5)
    
    Out[42]:
In [43]:
    
preprocessor = PreProcessor(titanic)
    
In [44]:
    
preprocessor.infer_subtypes()
    
    Out[44]:
In [45]:
    
titanic = titanic.drop('alive', axis = 1)
    
The dataset is not clean enough to be directly converted into a NumPy array and used for machine learning with scikit-learn: it still contains object columns and missing values.
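As a quick check, the cell below (a minimal pandas-only sketch) lists what blocks the direct conversion: the remaining object/category columns and the columns that still contain missing values.
In [ ]:
    
# sketch: non-numeric columns that prevent a direct conversion to a float array
print(titanic.select_dtypes(include=['object', 'category']).columns.tolist())
# columns that still contain missing values
na_counts = titanic.isnull().sum()
print(na_counts[na_counts > 0])
    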
In [46]:
    
features_full = pd.concat([titanic.loc[:, ['fare', 'age', 'pclass', 'sibsp', 'parch']],
                      pd.get_dummies(titanic['sex'], prefix='sex'),
                      pd.get_dummies(titanic['who'], prefix='who'),
                      pd.get_dummies(titanic['alone'], prefix='alone'),
                      pd.get_dummies(titanic['embarked'], prefix='embarked')],
                     axis=1)
    
In [47]:
    
features = pd.concat([titanic[['fare', 'age', 'pclass']],
                      pd.get_dummies(titanic['sex'], prefix='sex'),
                      pd.get_dummies(titanic['who'], prefix='who'),
                      pd.get_dummies(titanic['embarked'], prefix='embarked')],
                     axis=1)
    
In [48]:
    
target = titanic.survived
    
In [49]:
    
# Impute missing values 
imp = NaImputer(features_full)
features_full = imp.basic_naimputation(['age']) # still a pandas DataFrame, now with 'age' imputed
target = titanic.survived
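    
If you prefer plain pandas, a rough equivalent of the imputation above is sketched below; I am assuming a simple median fill here, which may not match NaImputer's actual strategy.
In [ ]:
    
# sketch (assumption): plain-pandas median fill of 'age', not NaImputer's actual implementation
features_full['age'] = features_full['age'].fillna(features_full['age'].median())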
    
In [50]:
    
# Creating the train/test split
features_train, features_test, target_train, target_test = train_test_split(
    features_full.values, target.values, test_size=0.25, random_state=0)
    
In [51]:
    
logreg = LogisticRegression(C=1)
logreg.fit(features_train, target_train)
target_pred = logreg.predict(features_test)
feature_names = features_full.columns
print("Accuracy : {}".format(accuracy_score(target_test, target_pred)))
weights = logreg.coef_.flatten()
dict_weights = {k:v for k,v in zip(feature_names, weights)}
    
    
In [52]:
    
def plot_simple_imp(imp, feature_names, sort=True, absolute=False):
    """ Plot importances/weights as a horizontal bar chart """
    serie = pd.Series(index=feature_names, data=imp)  # use the argument, not the global
    if absolute:
        serie = np.abs(serie)
    if sort:
        serie.sort_values(inplace=True, ascending=False)
    serie.plot(kind='barh')
    
In [53]:
    
plot_simple_imp(weights, feature_names)
    
    
In [54]:
    
# Looking at weights
feature_names = features_full.columns
def plot_abs_weights(coeff_arr, feature_names, title=None, legend_size=12, figsize=(15, 7)):
    """ Plot absolute coefficients, sorted, with labels kept aligned to the bars """
    coeff_arr = np.abs(coeff_arr)  # take absolute value
    order = np.argsort(coeff_arr)  # sorting order, so labels stay aligned with the bars
    plt.figure(figsize=figsize)
    plt.barh(range(len(feature_names)), coeff_arr[order])
    plt.yticks(range(len(feature_names)), np.asarray(feature_names)[order], size=legend_size)
    if title:
        plt.title(title)
    
In [55]:
    
plot_abs_weights(logreg.coef_.ravel(), feature_names, title="Absolute Coefficient Logistic Regression")
    
    
In [56]:
    
rf_full = RandomForestClassifier(n_estimators=500)
rf_full.fit(features_train, target_train)
    
    Out[56]:
In [57]:
    
rf_full.score(features_test, target_test)
    
    Out[57]:
In [58]:
    
plot_simple_imp(rf_full.feature_importances_, feature_names)
    
    
Here we try to predict who survives while the variable age contains naturally occurring missing values. We compare three ways of handling them: dropping the rows with a missing age (the _rm variables below), dropping the age column entirely (_cm), and imputing the missing values (_imp).
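Before comparing the strategies, a quick sanity check (a small sketch) of how many rows are actually affected:
In [ ]:
    
# sketch: count the rows with a missing age
print("{} rows out of {} have a missing age".format(features.age.isnull().sum(), len(features)))
    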
In [130]:
    
def rf_cv(features, target, random_state=1, n_estimators=200, scoring='accuracy', n_jobs=4, verbose=True):
    """ Print and return the scores of a random forest cross-validation """
    rf = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    scores = cross_val_score(rf, features, target, cv=4, scoring=scoring, n_jobs=n_jobs)
    if verbose:
        print("Random Forest CV scores: min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
                scores.min(), scores.mean(), scores.max()))
    return scores

def logreg_cv(features, target, scoring='accuracy', n_jobs=4, verbose=True):
    """ Print and return the scores of a logistic regression cross-validation """
    logreg = LogisticRegression(C=1)
    scores = cross_val_score(logreg, features, target, cv=4, scoring=scoring, n_jobs=n_jobs)
    if verbose:
        print("Logistic Regression CV scores: min: {:.3f}, mean: {:.3f}, max: {:.3f}".format(
                scores.min(), scores.mean(), scores.max()))
    return scores
    
def plot_roc_curve(target_test, target_predicted_proba):
    fpr, tpr, thresholds = roc_curve(target_test, target_predicted_proba[:, 1])
    
    roc_auc = auc(fpr, tpr)
    # Plot ROC curve
    plt.plot(fpr, tpr, label='ROC curve (AUC = %0.3f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate (1 - Specificity)')
    plt.ylabel('True Positive Rate (Sensitivity)')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    
In [131]:
    
# keep only the rows where age is not missing and convert to numpy arrays
index_missing_age = features.age.isnull()
features_rm_a, target_rm_a = features.loc[~index_missing_age, :].values, target[~index_missing_age].values
    
In [132]:
    
features_rm_a.shape
    
    Out[132]:
In [133]:
    
rf_cv(features_rm_a, target_rm_a, scoring='accuracy')
    
    
    Out[133]:
In [134]:
    
features_train_rm, features_test_rm, target_train_rm, target_test_rm = train_test_split(
    features_rm_a, target_rm_a, test_size=0.25, random_state=0)
    
In [135]:
    
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_train_rm, target_train_rm)
target_predicted_proba = rf.predict_proba(features_test_rm)
plot_roc_curve(target_test_rm, target_predicted_proba)
    
    
In [136]:
    
rf_cv(features_rm_a, target_rm_a,random_state=0, scoring='roc_auc')
    
    
    Out[136]:
In [137]:
    
# drop the age column entirely and convert to numpy arrays
features_cm_a, target_cm_a = features.drop('age', axis=1).values, target.values
    
In [138]:
    
features_cm_a.shape
    
    Out[138]:
In [139]:
    
rf_cv(features_cm_a, target_cm_a, scoring="accuracy")
    
    
    Out[139]:
In [140]:
    
features_train_cm, features_test_cm, target_train_cm, target_test_cm = train_test_split(
    features_cm_a, target_cm_a, test_size=0.25, random_state=0)
    
In [141]:
    
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_train_cm, target_train_cm)
target_predicted_proba = rf.predict_proba(features_test_cm)
plot_roc_curve(target_test_cm, target_predicted_proba)
    
    
In [142]:
    
rf_cv(features_cm_a, target_cm_a, scoring="roc_auc")
    
    
    Out[142]:
In [143]:
    
# keep age but impute it: start from a copy of the features
features_imp = features.copy()
features.shape
    
    Out[143]:
In [144]:
    
# features_imp.loc[:,'is_na_age'] =features_imp.age.isnull().astype(int)
# imp = NaImputer(features) # creating our imputer instance
# features_imp = imp.basic_naimputation(columns_to_process=['age'])
    
In [145]:
    
features_imp = features.fillna(-1)  # sentinel imputation: fill the remaining missing values (age) with -1
    
In [146]:
    
features_imp_a, target_imp_a = features_imp.values, target.values
    
In [147]:
    
rf_cv(features_imp_a, target_imp_a, scoring='accuracy')
    
    
    Out[147]:
In [148]:
    
features_train_imp, features_test_imp, target_train_imp, target_test_imp = train_test_split(
    features_imp_a, target_imp_a, test_size=0.25, random_state=0)
    
In [149]:
    
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_train_imp, target_train_imp)
target_predicted_proba = rf.predict_proba(features_test_imp)
plot_roc_curve(target_test_imp, target_predicted_proba)
    
    
In [150]:
    
rf_cv(features_imp_a, target_imp_a, scoring='roc_auc')
    
    
    Out[150]:
In [151]:
    
rf.feature_importances_
    
    Out[151]:
The purpose of this section is to simulate missing values in an important variable (pclass here) and to observe both the decrease in performance and our ability to correctly impute the missing values.
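The core of the simulation (a minimal sketch with hypothetical names df_sim and rows; the insert_na helper defined below adds the bookkeeping) is simply to blank out the column at a random set of row positions:
In [ ]:
    
# sketch: inject 20% missing values into a copy of the pclass column (df_sim and rows are illustrative names)
df_sim = features.copy()
rows = np.random.choice(len(df_sim), int(0.2 * len(df_sim)), replace=False)
df_sim.loc[rows, 'pclass'] = np.nan
df_sim.pclass.isnull().mean()
    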
In [152]:
    
# constructing features 
features_imp = pd.concat([titanic[['pclass']],
                      pd.get_dummies(titanic['sex'], prefix='sex'),
                      pd.get_dummies(titanic['who'], prefix='who')],axis=1)
    
In [153]:
    
features_imp.pclass.value_counts()
    
    Out[153]:
In [156]:
    
#scores_imp = logreg_cv(features_imp.drop('pclass',1), target)
    
In [174]:
    
def insert_na(features_full=features_imp, target=target, index=False,
              col_to_simulate='pclass', pct_na_toinsert=0.2, verbose=False):
    """ Return the dataset with a given percentage of NAs injected in one column
    (or, if index=True, only the positions where they would be injected) """
    nb_na_toinsert = int(pct_na_toinsert * len(features_full))
    index_na_toinsert = np.random.choice(range(len(features_full)), nb_na_toinsert, replace=False)
    if verbose:
        print("We are inserting {} missing values".format(len(index_na_toinsert)))
    features_full_imp = features_full.copy()
    if index:
        return index_na_toinsert
    else:
        features_full_imp.loc[index_na_toinsert, col_to_simulate] = np.nan
        return features_full_imp

def score_rf_sim(features_full=features_imp, target=target,
                 col_to_simulate='pclass', pct_na_toinsert=0.2, n_repeat=10, verbose=False, *args, **kwargs):
    """ Insert a percentage of missing values in a variable, impute them,
    and look at the influence on performance (scored with logreg_cv) """
    features_full_imp = insert_na(features_full, target=target,
                                  col_to_simulate=col_to_simulate, pct_na_toinsert=pct_na_toinsert, verbose=verbose)
    imp_f = NaImputer(features_full_imp)
    features_full_imp.loc[:, col_to_simulate] = imp_f.fillna_serie(colname=col_to_simulate)
    # repeated cross validation (left for reference)
#     score_rcv = 0
#     for i in range(n_repeat):
#         score_rcv += logreg_cv(features_full_imp, target, *args, **kwargs).mean()
    return logreg_cv(features_full_imp, target, verbose=False).mean()
    
In [175]:
    
score_rf_sim(col_to_simulate='pclass', verbose=True)
    
    
    Out[175]:
In [195]:
    
accuracy_mean_pct_na = np.array([score_rf_sim(
            pct_na_toinsert=i,col_to_simulate='pclass',verbose=True) for i in np.linspace(0,0.98,10)])
    
    
In [179]:
    
def sim_nmc(nmc=60, n_interval=5, *args, **kwargs):
    """ Average score_rf_sim over nmc Monte Carlo repetitions for each percentage of inserted NAs """
    res = np.zeros(n_interval)
    for _ in range(nmc):
        res += np.array([score_rf_sim(pct_na_toinsert=pct, *args, **kwargs)
                         for pct in np.linspace(0, 0.98, n_interval)])
    return res / nmc
    
In [180]:
    
test = sim_nmc(nmc=30, n_interval=5)
    
In [181]:
    
test
    
    Out[181]:
In [182]:
    
np.linspace(0,0.98,5)
    
    Out[182]:
In [183]:
    
plt.plot(np.linspace(0,0.98,5), test)
plt.title('Accuracy function of percentage of missing values inserted')
    
    Out[183]:
    
We start from the full feature set to get better predictive power.
In [184]:
    
features_pred = features_full.copy().drop_duplicates()
features_pred = features_pred.drop('age', axis = 1)
    
In [185]:
    
index_na = insert_na(col_to_simulate='pclass', pct_na_toinsert=0.2, index=True)
    
In [186]:
    
index_na = features_pred.index.isin(index_na)
    
In [187]:
    
features_pred.head()
    
    Out[187]:
In [188]:
    
target = features_pred.pclass
features_pred = features_pred.drop('pclass', axis = 1)
    
In [189]:
    
features_pred_train, target_pred_train = features_pred.loc[~index_na, :], target[~index_na]
features_pred_test, target_pred_test = features_pred.loc[index_na,:], target[index_na]
    
In [190]:
    
features_pred_train.shape
    
    Out[190]:
In [191]:
    
features_pred_test.head()
    
    Out[191]:
In [192]:
    
rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_pred_train, target_pred_train)
target_predicted = rf.predict(features_pred_test)
target_predicted_proba = rf.predict_proba(features_pred_test)
    
In [194]:
    
print(classification_report(target_pred_test, target_predicted))