In [1]:
import numpy as np
import pandas as pd
%pylab inline
pylab.style.use('ggplot')
Description: Factors relating to presence/absence of a levee failure at a site on lower Mississippi River, with predictors:
Site underlain by coarse-grain channel fill (sediment)
Borrow pit indicator
Meander location (1=Inside bend, 2=outside bend, 3=chute, 4=straight)
channel width
floodway width
constriction factor
land cover type (1=open water, 2=grassy, 3=agricultural, 4=forest)
vegetative buffer width
channel sinuosity
dredging intensity
bank revetement
Variables/Columns:
Failure 8 /* 1=Yes, 0=No */
year 12-16
river mile 18-24
sediments 32
borrow pit 40
meander 48
channel width 50-56
floodway width 58-64
constriction factor 66-72
land cover 80
veg width 82-88
sinuosity 90-96
dredging 98-104
revetement 112
In [30]:
url = 'http://www.stat.ufl.edu/~winner/data/lmr_levee.dat'
data_df = pd.read_csv(url, sep='[\s]+', engine='python', header=None)
In [31]:
data_df.info()
In [32]:
data_df.head()
Out[32]:
In [23]:
col_str = """Failure
year
river mile
sediments
borrow pit
meander
channel width
floodway width
constriction factor
land cover
veg width
sinuosity
dredging
revetement"""
col_names = [c.replace(' ', '_').lower() for c in col_str.split('\n')]
In [34]:
data_df.columns = col_names
In [35]:
data_df.head()
Out[35]:
In [36]:
data_df['year'].value_counts()
Out[36]:
In [38]:
data_df['revetement'].value_counts()
Out[38]:
In [39]:
data_df['failure'].value_counts()
Out[39]:
In [40]:
data_df.plot(kind='scatter', x='year', y='failure')
Out[40]:
In [41]:
data_df.plot(kind='scatter', x='river_mile', y='failure')
Out[41]:
In [42]:
data_df.plot(kind='scatter', x='channel_width', y='failure')
Out[42]:
In [43]:
data_df.plot(kind='scatter', x='floodway_width', y='failure')
Out[43]:
In [44]:
data_df.plot(kind='scatter', x='veg_width', y='failure')
Out[44]:
In [45]:
data_df.plot(kind='scatter', x='sinuosity', y='failure')
Out[45]:
In [48]:
data_df.plot(kind='scatter', x='constriction_factor', y='failure')
Out[48]:
In [70]:
from sklearn.feature_selection import chi2, SelectKBest
cat_cols = data_df.dtypes[data_df.dtypes == np.int64].index
cat_cols = [c for c in cat_cols if c not in ('failure', 'dredging', 'revetement')]
target = data_df['failure']
cat_features = data_df[cat_cols]
selector = SelectKBest(chi2, k=5)
selector.fit(X=cat_features, y=target)
feature_imp = pd.Series(data=selector.scores_, index=cat_cols)
feature_imp.sort_values(ascending=False)
Out[70]:
In [73]:
t = data_df[['river_mile', 'channel_width', 'floodway_width', 'constriction_factor', 'veg_width', 'sinuosity']]
t.corrwith(data_df['failure'])
Out[73]:
In [78]:
import statsmodels.formula.api as sm
model = sm.logit(formula='failure ~ sinuosity + constriction_factor + C(borrow_pit) + C(land_cover)', data=data_df)
result = model.fit(method='lbfgs', maxiter=100)
result.summary()
Out[78]:
In [89]:
from sklearn.model_selection import KFold
fold = KFold(n_splits=5, shuffle=True)
formula='failure ~ sinuosity + constriction_factor + C(borrow_pit) + C(land_cover)'
def cross_validate(train_index, test_index):
train_data, test_data = data_df.iloc[train_index], data_df.iloc[test_index]
t_model = sm.logit(formula=formula, data=train_data)
t_result = t_model.fit(method='lbfgs', maxiter=100)
t_predict = t_result.predict(test_data)
t_predict = t_predict.apply(lambda v: 1.0 if v >= 0.5 else 0.0)
return pd.concat({'predicted': t_predict, 'actual': test_data['failure']}, axis=1)
cv_results = [cross_validate(train_index, test_index)
for train_index, test_index in fold.split(data_df.index)]
In [91]:
from sklearn.metrics import f1_score
scores = [f1_score(r['predicted'], r['actual']) for r in cv_results]
sc = pd.Series(data=scores, name='F1_scores')
sc.plot(kind='bar', title='5-fold cross validation results')
Out[91]: