In [1]:
    
import numpy as np
import pandas as pd
%pylab inline
pylab.style.use('ggplot')
    
    
Description: Factors relating to presence/absence of a levee failure at a site on lower Mississippi River, with predictors:
Site underlain by coarse-grain channel fill (sediment)
Borrow pit indicator
Meander location (1=Inside bend, 2=outside bend, 3=chute, 4=straight)
channel width
floodway width
constriction factor
land cover type (1=open water, 2=grassy, 3=agricultural, 4=forest)
vegetative buffer width
channel sinuosity 
dredging intensity
bank revetement
Variables/Columns:
Failure   8  /* 1=Yes, 0=No  */
year  12-16
river mile   18-24
sediments   32
borrow pit    40
meander     48
channel width 50-56
floodway width  58-64
constriction factor  66-72
land cover     80
veg width    82-88
sinuosity   90-96
dredging    98-104
revetement   112
In [30]:
    
url = 'http://www.stat.ufl.edu/~winner/data/lmr_levee.dat'
data_df = pd.read_csv(url, sep='[\s]+', engine='python', header=None)
    
In [31]:
    
data_df.info()
    
    
In [32]:
    
data_df.head()
    
    Out[32]:
In [23]:
    
col_str = """Failure
year
river mile
sediments
borrow pit
meander
channel width
floodway width
constriction factor
land cover
veg width
sinuosity
dredging
revetement"""
col_names = [c.replace(' ', '_').lower() for c in col_str.split('\n')]
    
In [34]:
    
data_df.columns = col_names
    
In [35]:
    
data_df.head()
    
    Out[35]:
In [36]:
    
data_df['year'].value_counts()
    
    Out[36]:
In [38]:
    
data_df['revetement'].value_counts()
    
    Out[38]:
In [39]:
    
data_df['failure'].value_counts()
    
    Out[39]:
In [40]:
    
data_df.plot(kind='scatter', x='year', y='failure')
    
    Out[40]:
    
In [41]:
    
data_df.plot(kind='scatter', x='river_mile', y='failure')
    
    Out[41]:
    
In [42]:
    
data_df.plot(kind='scatter', x='channel_width', y='failure')
    
    Out[42]:
    
In [43]:
    
data_df.plot(kind='scatter', x='floodway_width', y='failure')
    
    Out[43]:
    
In [44]:
    
data_df.plot(kind='scatter', x='veg_width', y='failure')
    
    Out[44]:
    
In [45]:
    
data_df.plot(kind='scatter', x='sinuosity', y='failure')
    
    Out[45]:
    
In [48]:
    
data_df.plot(kind='scatter', x='constriction_factor', y='failure')
    
    Out[48]:
    
In [70]:
    
from sklearn.feature_selection import chi2, SelectKBest
cat_cols = data_df.dtypes[data_df.dtypes == np.int64].index
cat_cols = [c for c in cat_cols if  c not in ('failure', 'dredging', 'revetement')]
target = data_df['failure']
cat_features = data_df[cat_cols]
selector = SelectKBest(chi2, k=5)
selector.fit(X=cat_features, y=target)
feature_imp = pd.Series(data=selector.scores_, index=cat_cols)
feature_imp.sort_values(ascending=False)
    
    Out[70]:
In [73]:
    
t = data_df[['river_mile', 'channel_width', 'floodway_width', 'constriction_factor', 'veg_width', 'sinuosity']]
t.corrwith(data_df['failure'])
    
    Out[73]:
In [78]:
    
import statsmodels.formula.api as sm
model = sm.logit(formula='failure ~ sinuosity + constriction_factor + C(borrow_pit) + C(land_cover)', data=data_df)
result = model.fit(method='lbfgs', maxiter=100)
result.summary()
    
    Out[78]:
In [89]:
    
from sklearn.model_selection import KFold
fold = KFold(n_splits=5, shuffle=True)
formula='failure ~ sinuosity + constriction_factor + C(borrow_pit) + C(land_cover)'
def cross_validate(train_index, test_index):
    train_data, test_data = data_df.iloc[train_index], data_df.iloc[test_index]
    
    t_model = sm.logit(formula=formula, data=train_data)
    t_result = t_model.fit(method='lbfgs', maxiter=100)
    t_predict = t_result.predict(test_data)
    t_predict = t_predict.apply(lambda v: 1.0 if v >= 0.5 else 0.0)
    
    return pd.concat({'predicted': t_predict, 'actual': test_data['failure']}, axis=1)
cv_results = [cross_validate(train_index, test_index) 
              for train_index, test_index in fold.split(data_df.index)]
    
In [91]:
    
from sklearn.metrics import f1_score
scores = [f1_score(r['predicted'], r['actual']) for r in cv_results]
sc = pd.Series(data=scores, name='F1_scores')
sc.plot(kind='bar', title='5-fold cross validation results')
    
    Out[91]: