In [1]:
import sys
import re
import time
import datetime
import pandas as pd
import numpy as np
import func
# inline plot
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
df = pd.read_csv("data/merged_concat_final.csv",sep='\t',error_bad_lines=False)
del df['Unnamed: 0']
print df.shape
In [5]:
df['timestamp'] = pd.to_datetime(df['timestamp'],format="%Y-%m-%d %H:%M:%S")
df = df.set_index('timestamp')
In [6]:
only_delay = df[(df['is_delay']==1)]
In [326]:
len(only_delay)
Out[326]:
In [7]:
print "relative to delay"
print (df[['is_delay','del_min','del_med','del_maj','del_cat']].sum()/float(df['is_delay'].sum()))*100 , '%'
print "Relative to total"
print (df[['is_delay','del_min','del_med','del_maj','del_cat']].sum()/float(len(df)))*100 , '%'
In [8]:
# One-hot encode the train IDs and concatenate the dummies onto the main dataset
train_id_dummies = pd.get_dummies(df['train_id'],prefix='tid')
train_id_dummies.shape
train_id_dummies.columns.values
del train_id_dummies['tid_101.0'] # Delete as base var
tid_col = train_id_dummies.columns.values
df = pd.concat([df, train_id_dummies], axis=1)
In [73]:
# Including train IDs
features = df.columns.values
target_cols = ['temp','precipiation',
'visability','windspeed','humidity','cloudcover',
'is_bullet','is_limited','t_northbound',
'd_tuesday','d_wednesday','d_thursday','d_friday',] + list(tid_col)
X = df[target_cols]
# del X['is_delay']
# del X['tweet_id']
# X['timestamp'] = X['timestamp'].apply(lambda x: (np.datetime64(x).astype('uint64') / 1e6).astype('uint32'))
# y = df['ord_del']
y = df['is_delay']
In [74]:
# X['timestamp'] = X['timestamp'].apply(lambda x:int(x))
# X['stop_pa'] = X['stop_pa'].apply(lambda x:int(x))
# X['train_id'] = X['train_id'].apply(lambda x:int(x))
X['t_northbound'] = X['t_northbound'].apply(lambda x:int(x))
X['cloudcover'] = X['cloudcover'].fillna(X['cloudcover'].mean())
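Note: X = df[target_cols] above is a view of df, so the two in-place fixes in this cell may raise pandas' SettingWithCopyWarning. A minimal sketch of the same step on an explicit copy (assumes df and target_cols from the cells above):
In [ ]:
# Sketch: build the feature frame as an explicit copy so the in-place
# cleanup does not trigger SettingWithCopyWarning.
X = df[target_cols].copy()
X['t_northbound'] = X['t_northbound'].astype(int)
X['cloudcover'] = X['cloudcover'].fillna(X['cloudcover'].mean())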
In [289]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.25)
In [76]:
from sklearn.tree import DecisionTreeClassifier
TreeClass = DecisionTreeClassifier(
max_depth = 2,
min_samples_leaf = 5)
TreeClass.fit(X_train,y_train)
Out[76]:
In [394]:
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(TreeClass, X, y, cv=10)
print(scores.mean()) # mean CV accuracy: higher is better (error rate = 1 - score)
In [395]:
Depth = range(1,10)
score = []
for i in Depth:
    TreeClass = DecisionTreeClassifier(
        max_depth = i,
        min_samples_leaf = 5)
    scores = cross_val_score(TreeClass, X, y, cv=10)
    score.append(np.mean(scores))
Depth_Choice_df = pd.DataFrame({'cv_scores': score ,'Depth': Depth})
Depth_Choice_df.plot(x ='Depth',y = 'cv_scores' )
# It seems like at depth = 4 we achieved the best result
Out[395]:
In [80]:
Depth_Choice_df.head()
Out[80]:
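As a cross-check on the manual depth loop above, the same search can be expressed with GridSearchCV (a sketch using sklearn.grid_search, where it lives in the sklearn version used here; assumes X and y from the cells above):
In [ ]:
# Sketch: grid-search the tree depth instead of looping by hand.
from sklearn.grid_search import GridSearchCV
param_grid = {'max_depth': range(1, 10), 'min_samples_leaf': [5]}
grid = GridSearchCV(DecisionTreeClassifier(), param_grid, cv=10)
grid.fit(X, y)
print grid.best_params_, grid.best_score_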
In [ ]:
TreeClass = DecisionTreeClassifier(
max_depth = 2,
min_samples_leaf = 5)
In [405]:
TreeClass.fit(X_train,y_train)
ImportanceDataFrame = pd.DataFrame({'feature':X.columns.values, 'Gini_Import':TreeClass.feature_importances_})
ImportanceDataFrame.sort_values(by = ['Gini_Import'],ascending = 0).head(10)
Out[405]:
In [94]:
df[['temp','precipiation','visability','windspeed','humidity','cloudcover']].corr()
Out[94]:
In [83]:
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#example-model-selection-plot-confusion-matrix-py
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(2)
    plt.xticks(tick_marks, ['No Delay','Delay'], rotation=45)
    plt.yticks(tick_marks, ['No Delay','Delay'])
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [400]:
from sklearn.metrics import confusion_matrix
TreeClass.fit(X_train,y_train)
y_hat = TreeClass.predict(X_test)
cmat = confusion_matrix(y_test, y_hat)
c_norm = cmat.astype('float') / cmat.sum(axis=1)[:, np.newaxis]
print "Confusion matrix:"
print cmat
print "Normalized"
print np.round(c_norm,4)
plot_confusion_matrix(c_norm) # Normalized
print "Accuracy"
print np.round((cmat[1][1] + cmat[0][0]) / float(len(X_test)),4),'%'
print "Misclass"
print np.round((cmat[1][0] + cmat[0][1]) / float(len(X_test)),4),'%'
print "Sensitivity"
print np.round((cmat[1][1]) / float(cmat[1][1]+cmat[1][0]),4),'%'
print "Specificity"
print np.round((cmat[0][0]) / float(cmat[0][0]+cmat[0][1]),4),'%'
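The hand-computed rates above can be cross-checked against sklearn's built-in summary (a sketch; assumes y_test and y_hat from this cell):
In [ ]:
# Sketch: sanity-check the manual metrics with sklearn's report.
from sklearn.metrics import accuracy_score, classification_report
print accuracy_score(y_test, y_hat)
print classification_report(y_test, y_hat, target_names=['No Delay', 'Delay'])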
In [401]:
from sklearn.metrics import roc_curve, roc_auc_score
y_hat_probability = TreeClass.predict_proba(X_test).T[1]
print(y_hat_probability)
print(roc_auc_score(y_test, y_hat_probability))
vals = roc_curve(y_test, y_hat_probability)
Roc_DataFrame = pd.DataFrame({'False_Positive_Rate':vals[0],'True_Positive_Rate':vals[1]})
Roc_DataFrame.plot(x = 'False_Positive_Rate' , y = 'True_Positive_Rate' )
Out[401]:
In [390]:
from sklearn.metrics import roc_curve, auc,roc_auc_score
TreeClass.fit(X_train,y_train)
y_hat_probability = TreeClass.predict_proba(X_test).T[1]
print(y_hat_probability)
print(roc_auc_score(y_test, y_hat_probability))
vals = roc_curve(y_test, y_hat_probability)
Roc_DataFrame = pd.DataFrame({'False_Positive_Rate':vals[0],'True_Positive_Rate':vals[1]})
Roc_DataFrame.plot(x = 'False_Positive_Rate' , y = 'True_Positive_Rate' )
Out[390]:
In [404]:
from sklearn import feature_selection
pvals = feature_selection.f_regression(X,y)[1]
# sorted descending by p-value, so head(5) shows the least significant features
pd.DataFrame(sorted(zip(X.columns.values,np.round(pvals,4)),key=lambda x:x[1],reverse=True),columns=['Feature','p_value']).head(5)
Out[404]:
In [60]:
from sklearn.linear_model import LogisticRegression
lm = LogisticRegression()
lm.fit(X,y)
X_lr = df[['windspeed','t_northbound','precipiation','d_friday']]
# localize the search around the maximum value found earlier
c_list = np.logspace(-1,1,21)
c_index = np.linspace(-1,1,21)   # log10 of c_list, used as the plot axis
# C is the inverse of lambda: the smaller the C, the stronger the
# regularization, so smaller C values select fewer variables
cv_scores = []
for c_score in c_list:
    lm = LogisticRegression(C = c_score, penalty = "l1")
    cv_scores.append(cross_val_score(lm,X,y,cv=10).mean())
C_Choice_df = pd.DataFrame({'cv_scores': cv_scores ,'Log_C': c_index })
C_Choice_df.plot(x ='Log_C',y = 'cv_scores' )
# The best score appears around log10(C) = -0.1; among comparable scores we
# prefer the smaller (more restrictive) C
Out[60]:
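To see the variable-selection effect described in the comment above, a quick sketch counts the non-zero L1 coefficients at a few illustrative C values (assumes X and y from the cells above):
In [ ]:
# Sketch: smaller C (stronger L1 penalty) should zero out more coefficients.
for c in [0.1, 1.0, 10.0]:
    lm = LogisticRegression(C=c, penalty="l1")
    lm.fit(X, y)
    print c, (lm.coef_ != 0).sum(), "non-zero coefficients"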
In [100]:
X.head()
Out[100]:
In [195]:
from sklearn.ensemble import GradientBoostingClassifier
GBC_Tree = GradientBoostingClassifier(learning_rate = 0.01,
n_estimators = 10000,
max_depth = 2,
min_samples_leaf = 5)
from sklearn import cross_validation
kf = cross_validation.KFold(len(df), n_folds = 10, shuffle = True) #10 fold CV
scores = []
for train_index, test_index in kf:
    GBC_Tree.fit(X.iloc[train_index], y.iloc[train_index])
    y_hat_test = GBC_Tree.predict(X.iloc[test_index])
    scores.append(float(sum(y_hat_test == y.iloc[test_index]))/len(y_hat_test))
Score_GBC_CV = np.mean(scores)
print(Score_GBC_CV)
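For reference, the manual fold loop above should agree (up to fold randomness) with accuracy-scored cross_val_score; a sketch assuming the same GBC_Tree, X, and y (and just as slow with 10,000 trees):
In [ ]:
# Sketch: the same 10-fold accuracy estimate via cross_val_score.
print cross_val_score(GBC_Tree, X, y, cv=10).mean()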
Training the Boosting classifier
In [361]:
Score = []
NumberOfTrees = [100,1000,5000,10000,20000]
for i in NumberOfTrees:
    GBR_Tree = GradientBoostingClassifier(learning_rate = 0.01,
                                          n_estimators = i,
                                          max_depth = 2,
                                          min_samples_leaf = 5)
    # 'mean_squared_error' scoring returns negative MSE, hence the sign flip below
    scores = cross_val_score(GBR_Tree, X, y, cv=10, scoring='mean_squared_error')
    Score.append(np.mean(np.sqrt(-scores)))
Depth_Choice_df = pd.DataFrame({'CV_Error': Score ,'Number Of Trees': NumberOfTrees})
Depth_Choice_df.plot(x ='Number Of Trees',y = 'CV_Error' )
Out[361]:
In [360]:
Depth = range(1,5)
Score = []
for i in Depth:
    GBR_Tree = GradientBoostingClassifier(learning_rate = 0.01,
                                          n_estimators = 1000,
                                          max_depth = i,
                                          min_samples_leaf = 5)
    scores = cross_val_score(GBR_Tree, X, y, cv=10, scoring='mean_squared_error')
    Score.append(np.mean(np.sqrt(-scores)))
Depth_Choice_df = pd.DataFrame({'CV_Error': Score ,'Max_Depth': Depth})
Depth_Choice_df.plot(x ='Max_Depth',y = 'CV_Error' )
Out[360]:
In [362]:
Depth = range(1,5)
Score = []
for i in Depth:
    GBR_Tree = GradientBoostingClassifier(learning_rate = 0.01,
                                          n_estimators = 10000,
                                          max_depth = i,
                                          min_samples_leaf = 5)
    scores = cross_val_score(GBR_Tree, X, y, cv=10, scoring='mean_squared_error')
    Score.append(np.mean(np.sqrt(-scores)))
Depth_Choice_df = pd.DataFrame({'CV_Error': Score ,'Max_Depth': Depth})
Depth_Choice_df.plot(x ='Max_Depth',y = 'CV_Error' )
Out[362]:
In [363]:
# TUNED TREE -- reuse the GBC_Tree name so the cells below fit and evaluate these settings
GBC_Tree = GradientBoostingClassifier(learning_rate = 0.01,
                                      n_estimators = 1000,
                                      max_depth = 1,
                                      min_samples_leaf = 5)
In [365]:
GBC_Tree.fit(X_train,y_train)
ImportanceDataFrame = pd.DataFrame({'feature':X.columns.values, 'Importance':GBC_Tree.feature_importances_})
ImportanceDataFrame.sort_values(by = ['Importance'],ascending = 0).head(10)
Out[365]:
In [397]:
from sklearn.metrics import confusion_matrix
y_hat = GBC_Tree.predict(X_test)
cmat = confusion_matrix(y_test, y_hat)
c_norm = cmat.astype('float') / cmat.sum(axis=1)[:, np.newaxis]
print "Confusion matrix:"
print cmat
print "Normalized"
print np.round(c_norm,4)
plot_confusion_matrix(c_norm) # Normalized
print "Accuracy"
print np.round((cmat[1][1] + cmat[0][0]) / float(len(X_test)),4),'%'
print "Misclass"
print np.round((cmat[1][0] + cmat[0][1]) / float(len(X_test)),4),'%'
print "Sensitivity"
print np.round((cmat[1][1]) / float(cmat[1][1]+cmat[1][0]),4),'%'
print "Specificity"
print np.round((cmat[0][0]) / float(cmat[0][0]+cmat[0][1]),4),'%'
In [202]:
import forecastio
keys = pd.read_csv('keys.csv') # hidden from github
forecastio_key = keys.iloc[5].string
api_key = forecastio_key
lat = 37.441879
lng = -122.143021
In [204]:
current = forecastio.load_forecast(api_key, lat, lng).currently()
print current.d  # raw forecast.io payload for the current conditions
In [324]:
def get_train_prediction(trainid, threshold=0.2):
    The_train = str(trainid)
    now = datetime.datetime.now()   # 'now' avoids shadowing the time module
    forecast = forecastio.load_forecast(api_key, lat, lng, time=now)
    current = forecast.currently()
    Predict_me = pd.DataFrame({
        'temp': [current.d['apparentTemperature']],
        'precipiation': [current.d['precipIntensity']],
        'visability': [current.d['visibility']],
        'windspeed': [current.d['windSpeed']],
        'humidity': [current.d['humidity']],
        'cloudcover': [current.d['cloudCover']],
        'is_bullet': 1 if The_train[0] == '2' else 0,
        'is_limited': 1 if The_train[0] == '3' else 0,
        't_northbound': int(The_train) % 2,
        'd_tuesday': 1 if now.weekday() == 1 else 0,    # weekday() is a method call
        'd_wednesday': 1 if now.weekday() == 2 else 0,
        'd_thursday': 1 if now.weekday() == 3 else 0,
        'd_friday': 1 if now.weekday() == 4 else 0
    })
    # One-hot row for the train ID, aligned with the training dummies
    t = pd.DataFrame(columns=tid_col)
    t = t.append([0]).fillna(0)
    t['tid_' + The_train + '.0'] = 1
    Predict_me = pd.concat([Predict_me, t], axis=1)
    del Predict_me[0]
    Predict_me = Predict_me[X.columns]   # match the training feature order
    pprob = GBC_Tree.predict_proba(Predict_me).T
    pred = 1 if pprob[1] >= threshold else 0
    print "Will be delayed:", pred
    print "Probability:", np.round(pprob.T, 4)
    return [pred, pprob]
In [383]:
get_train_prediction(381)
Out[383]:
In [412]:
def PredictThreshold(Predictprob, Threshhold):
    y_predict = 0
    if Predictprob >= Threshhold:
        y_predict = 1
    return y_predict

y_hat_probability = GBC_Tree.predict_proba(X_test).T[1]
y_hat_predict_threshold = []
threshold = 0.1  # lowering it increases false positives, raising it increases false negatives
for i in range(0,len(y_hat_probability)):
    y_hat_predict_threshold.append(PredictThreshold(y_hat_probability[i],threshold))
cmat = confusion_matrix(y_test, y_hat_predict_threshold)
c_norm = cmat.astype('float') / cmat.sum(axis=1)[:, np.newaxis]
print "Confusion matrix:"
print cmat
print "Normalized"
print np.round(c_norm,4)
plot_confusion_matrix(c_norm) # Normalized
print "Accuracy"
print np.round((cmat[1][1] + cmat[0][0]) / float(len(X_test)),4),'%'
print "Misclass"
print np.round((cmat[1][0] + cmat[0][1]) / float(len(X_test)),4),'%'
print "Sensitivity"
print np.round((cmat[1][1]) / float(cmat[1][1]+cmat[1][0]),4),'%'
print "Specificity"
print np.round((cmat[0][0]) / float(cmat[0][0]+cmat[0][1]),4),'%'
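The threshold trade-off noted in the comment above can also be read off sklearn's precision-recall sweep, which evaluates every candidate threshold at once (a sketch; assumes y_test and y_hat_probability from this cell):
In [ ]:
# Sketch: precision and recall at every candidate threshold.
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_test, y_hat_probability)
pr_df = pd.DataFrame({'threshold': thresholds,
                      'precision': precision[:-1],
                      'recall': recall[:-1]})
pr_df.plot(x='threshold', y=['precision', 'recall'])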
In [370]:
# GBC_Tree = GradientBoostingClassifier(learning_rate = 0.01,
# n_estimators = 10000,
# max_depth = 3,
# min_samples_leaf = 5)
# GBC_Tree.fit(X_train,y_train)
cmatrix = []
y_hat_probability = GBC_Tree.predict_proba(X_test).T[1]
for t in range(0,100,1):
    y_hat_predict_threshold = []
    thresh = t/100.0
    for i in range(0,len(y_hat_probability)):
        y_hat_predict_threshold.append(PredictThreshold(y_hat_probability[i],thresh))
    cm = confusion_matrix(y_test, y_hat_predict_threshold)
    # cm[0][1] = no-delay misclassified as delay, cm[1][0] = delay missed as no-delay
    cmatrix.append([thresh,cm[0][0],cm[0][1],cm[1][0],cm[1][1]])
thmatrix = pd.DataFrame(cmatrix, columns=['Thresh','T_ND','F_D','F_ND','T_D'])
In [371]:
# no need to set the index; 'Thresh' is used directly as the x-axis below
thmatrix.plot(x='Thresh',y=['T_ND','F_ND','F_D','T_D'],kind='line', ylim=(0,100),xlim=(0.,1.))
Out[371]:
In [372]:
n_test = float(len(y_test))
thmatrix['acc'] = thmatrix.apply(lambda x: (x.T_D + x.T_ND) / n_test, axis=1)
thmatrix['miss'] = thmatrix.apply(lambda x: (x.F_D + x.F_ND) / n_test, axis=1)
thmatrix['sens'] = thmatrix.apply(lambda x: x.T_D / float(x.T_D + x.F_ND), axis=1)   # TP / (TP + FN)
thmatrix['spec'] = thmatrix.apply(lambda x: x.T_ND / float(x.T_ND + x.F_D), axis=1)  # TN / (TN + FP)
In [373]:
thmatrix.plot(x='Thresh',y=['acc','sens','miss','spec'],kind='line', ylim=(0.,1.0),xlim=(0.,1.))
Out[373]:
In [391]:
y_hat_probability = GBC_Tree.predict_proba(X_test).T[1]
print(y_hat_probability)
print(roc_auc_score(y_test, y_hat_probability))
vals = roc_curve(y_test, y_hat_probability)
Roc_DataFrame = pd.DataFrame({'False_Positive_Rate':vals[0],'True_Positive_Rate':vals[1]})
Roc_DataFrame.plot(x = 'False_Positive_Rate' , y = 'True_Positive_Rate' )
Out[391]:
In [ ]:
from sklearn.ensemble import GradientBoostingClassifier
GBC_Tree = GradientBoostingClassifier(learning_rate = 0.01,
n_estimators = 10000,
max_depth = 2,
min_samples_leaf = 5)
from sklearn import cross_validation
kf = cross_validation.KFold(len(df), n_folds = 10, shuffle = True) # 10-fold CV
scores = []
for train_index, test_index in kf:
    GBC_Tree.fit(X.iloc[train_index], y.iloc[train_index])
    # classify with a 0.05 probability threshold instead of the default 0.5
    y_hat_prob = GBC_Tree.predict_proba(X.iloc[test_index]).T[1]
    y_hat_test = np.array([PredictThreshold(p, 0.05) for p in y_hat_prob])
    scores.append(float(sum(y_hat_test == y.iloc[test_index]))/len(y_hat_test))
Score_GBC_CV = np.mean(scores)
print(Score_GBC_CV)
In [428]:
drames = pd.read_csv("data/all_stops_in_pa.csv",sep=',',error_bad_lines=False)
In [429]:
dat = []
for i in drames.values:
    if i[0] == 101:
        pass   # 101 is the base (dropped) train-ID dummy
    else:
        pred = get_train_prediction(i[0])
        s = pd.Series({'id':i[0],'time':i[1],'dir':i[2],'pred':pred})
        dat.append(s)
d = pd.DataFrame(dat)
In [431]:
prob = pd.DataFrame(dat)
In [438]:
prob['pred'][0]
Out[438]:
In [450]:
def ref(t):
    # unpack [pred, pprob] into (prediction, P(delay), P(no delay));
    # pprob row 0 is the no-delay probability, row 1 is the delay probability
    l = list(t)
    return pd.Series([l[0], l[1][1][0], l[1][0][0]])
In [454]:
prob[['is_delay','prob_delay','prob_nodelay']] = prob['pred'].apply(lambda x:ref(x))
In [456]:
del prob['pred']
In [474]:
prob = prob.sort_values('time')
prob.plot.area(x='time', y=['is_delay','prob_delay'], figsize=(15,5), stacked=False)
Out[474]:
In [466]:
prob.sort_values('time')
Out[466]:
In [ ]: