In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import savemat
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from jeeves.cross_validation import report_stratifiedKfold_cv
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import recall_score
from sklearn.externals.six import StringIO
from IPython.display import Image  
import pydot
%matplotlib inline

In [2]:
# Load the tagged proposals; read_csv with an explicit tab separator is
# equivalent to read_table (whose default sep is "\t").
df = pd.read_csv("tagged.tsv", sep="\t")

In [3]:
# One-hot encode each categorical column and join the dummy frames side by side.
dfCat = pd.concat(
    [pd.get_dummies(df[col]) for col in ("section", "target_audience", "type")],
    axis=1,
)

In [4]:
# Boolean indicator columns, cast to 0/1 integers.
dfBool = df[["speaker_link_present", "content_url_present"]].astype(int)

In [5]:
# Numeric features, taken as-is.
dfNum = df[["n_votes", "n_comments", "deadlinediff"]]

In [6]:
# Feature groups to be assembled column-wise into the design matrix.
feats = [dfCat, dfBool, dfNum]

In [7]:
# Design matrix (all feature groups side by side) and 0/1 target vector.
X = pd.concat(feats, axis=1)
y = df['selected'].astype(int).values

In [8]:
# Export the training data to MATLAB .mat format for use outside Python.
savemat("trainData.mat", dict(X=X.values, y=y, cols=X.columns.values))

In [9]:
# Copy of the design matrix with the deadline offset folded to its magnitude
# (assign returns a new frame; X itself is untouched, as with the copy before).
xabs = X.assign(deadlinediff=X["deadlinediff"].abs())

In [14]:
# 10-fold stratified CV of a depth-unlimited tree (min 2 samples per leaf)
# on the absolute-deadline features.
# NOTE(review): report_stratifiedKfold_cv comes from the project's jeeves
# package; judging by Out[14] it returns per-fold accuracy/recall — confirm.
report_stratifiedKfold_cv(DecisionTreeClassifier(min_samples_leaf=2), xabs.values, y, shuffle=True, n_folds=10)


Out[14]:
accuracy_score recall_score
0 0.966667 1.000000
1 0.866667 0.571429
2 0.900000 0.571429
3 0.800000 0.714286
4 0.931034 0.833333
5 0.821429 0.666667
6 0.892857 0.833333
7 0.892857 0.833333
8 0.928571 0.666667
9 0.821429 0.333333

In [12]:
# Hyper-parameter search space for the decision tree.
param_grid = {
    "criterion": ["entropy"],
    "splitter": ["best", "random"],
    "max_features": [None, "auto", "sqrt", "log2"],
    "class_weight": ["balanced", None],
    "presort": [True, False],
    "min_samples_leaf": np.arange(1, 6),
}
clf = DecisionTreeClassifier()
# Exhaustive grid search, parallelized across all cores.
gcv = GridSearchCV(clf, param_grid=param_grid, n_jobs=-1)

In [13]:
# Run the grid search on the raw (signed deadlinediff) feature matrix.
gcv.fit(X.values, y)


Out[13]:
GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'presort': [True, False], 'splitter': ['best', 'random'], 'criterion': ['entropy'], 'max_features': [None, 'auto', 'sqrt', 'log2'], 'class_weight': ['balanced', None], 'min_samples_leaf': array([1, 2, 3, 4, 5])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [14]:
gcv.best_params_  # hyper-parameter combination with the best mean CV score


Out[14]:
{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_features': None,
 'min_samples_leaf': 2,
 'presort': False,
 'splitter': 'best'}

In [15]:
# Mean cross-validated score of the best combination (scoring=None in
# Out[13], so presumably the classifier's default accuracy — confirm).
gcv.best_score_


Out[15]:
0.87889273356401387

In [11]:
# Tree with the tuned settings.
# NOTE(review): best_params_ above also selected class_weight='balanced',
# which is omitted here — confirm that is intentional.
clf = DecisionTreeClassifier(criterion="entropy", min_samples_leaf=2)

In [53]:
# Feature matrix without the deadline-offset column.
X_nondead = X.drop("deadlinediff", axis=1)

In [15]:
# Mean 10-fold CV accuracy/recall of the tuned tree on |deadlinediff| features.
report_stratifiedKfold_cv(clf, xabs.values, y, n_folds=10).mean()


Out[15]:
accuracy_score    0.863235
recall_score      0.695238
dtype: float64

In [16]:
# Render the fitted tree as an inline PNG via graphviz/pydot.
# NOTE(review): assumes clf has been fitted by this point (export_graphviz
# fails on an unfitted estimator); the fit is not visible in this file — confirm.
dot_data = StringIO()  
export_graphviz(clf, out_file=dot_data,  
                feature_names=xabs.columns,  
                class_names=['rejected', 'selected'],  
                filled=True, rounded=True,  
                special_characters=True)  
# NOTE(review): newer pydot versions return a *list* of graphs from
# graph_from_dot_data; this code assumes the older single-graph return value.
graph = pydot.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())


Out[16]:

In [ ]:


In [17]:
# Predictions on the full (training) set — in-sample, so the scores computed
# below are optimistic relative to the CV numbers above.
preds = clf.predict(xabs.values)

In [18]:
from sklearn.metrics import recall_score, precision_score, precision_recall_curve

# In-sample recall and precision for the tree.
# Parenthesized single-argument print prints identically under Python 2 and 3,
# unlike the original Python-2-only print statements.
print(recall_score(y, preds))
print(precision_score(y, preds))


0.890625
0.934426229508

In [19]:
from sklearn.metrics import confusion_matrix

In [20]:
# Rows: true class (0=rejected, 1=selected); columns: predicted class.
confusion_matrix(y, preds)


Out[20]:
array([[221,   4],
       [  7,  57]])

In [23]:
from sklearn.metrics import roc_curve

In [24]:
# (Removed leftover IPython help lookup `roc_curve?` — interactive-only
# syntax, not needed in the final notebook.)

In [78]:
# Predicted probability of the positive (selected) class for every sample.
# NOTE(review): uses X (signed deadlinediff) whereas the tree was evaluated on
# xabs (absolute values) above — confirm which matrix the model was fit on.
pp = clf.predict_proba(X.values)[:, 1]

In [79]:
# ROC points: false-positive rate and true-positive rate (thresholds discarded).
fpr, tpr, _ = roc_curve(y, pp)

In [80]:
# ROC curve, drawn with the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(fpr, tpr, linewidth=8)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')


Out[80]:
<matplotlib.text.Text at 0x11c090e50>

In [28]:
from sklearn.metrics import roc_auc_score

In [29]:
# Area under the ROC curve — computed in-sample, so an optimistic estimate.
roc_auc_score(y, pp)


Out[29]:
0.98802083333333335

In [30]:
# (Removed leftover IPython help lookup `precision_recall_curve?` —
# interactive-only syntax, not needed in the final notebook.)

In [31]:
# Precision (p) / recall (r) pairs over all thresholds (thresholds discarded).
p, r, _ = precision_recall_curve(y, pp)

In [32]:
# Precision-recall trade-off. Axis labels added so the figure stands alone;
# note precision is on the x-axis here (recall-vs-precision is the more
# common orientation — kept as-is to preserve the original plot).
plt.plot(p, r)
plt.xlabel('Precision')
plt.ylabel('Recall')


Out[32]:
[<matplotlib.lines.Line2D at 0x118deba50>]

In [33]:
# Final Verification

In [41]:
# Sanity check of a tree split: proposals with deadlinediff above -25.5 are
# almost never selected (threshold presumably read off the rendered tree —
# confirm against the graphviz plot).
df[X.deadlinediff > -25.5].selected.value_counts()


Out[41]:
False    187
True       1
Name: selected, dtype: int64

In [46]:
# Same sanity check for the -44.5 split threshold.
df[X.deadlinediff > -44.5].selected.value_counts()


Out[46]:
False    205
True      25
Name: selected, dtype: int64

In [ ]: