Algorithms Exploration

Here we'll be testing a few different supervised learning algorithms against the dataset we developed so far


In [1]:
#Importing the dataset

import pandas as pd
startups = pd.read_csv('data/startups_3.csv', index_col=0)
startups[:3]

X = startups.drop('acquired', 1)
X_numeric = X.filter(regex=('(number_of|avg_).*|.*(funding_total_usd|funding_rounds|_at)'))
X_categorical = X.filter(regex=('(Category_|state_).*'))
X_state = X.filter(regex=('(state_).*'))
X_category = X.filter(regex=('(Category_).*'))

y = startups['acquired']

In [9]:
startups_not_operating = pd.read_csv('data/startups_not_operating_3.csv', index_col=0)
X = startups_not_operating.drop('acquired', 1)
y = startups_not_operating['acquired']

In [7]:
from sklearn.decomposition import PCA
pca = PCA(n_components=6)
pca.fit(X_numeric)
X_pca =  pca.transform(X_numeric)
X_pca = pd.DataFrame(X_pca)
X_pca[:3]


Out[7]:
0 1 2 3 4 5
0 -0.057245 0.120264 -0.010556 -0.018942 -0.032121 -0.011784
1 -0.087023 -0.012353 -0.062867 -0.050250 0.004071 -0.002986
2 0.191721 0.115093 0.004067 0.007415 0.031256 -0.055648

In [7]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42, return_indices=True)
X_undersampled, y_undersampled, indices = rus.fit_sample(X, y)

In [6]:
X_software = X[X['Category_Software'] == 1]
y_software = y.loc[X_software.index]

In [9]:
y_software.shape


Out[9]:
(5492L,)

In [13]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn import grid_search

def run_classifier(parameters, classifier, stratify=True, new_X=None, new_y=None):
    X_train, X_test, y_train, y_test = train_test_split(new_X if new_X is not None else X, new_y if new_y is not None else y, test_size=0.2, random_state=42, stratify = y if stratify else None)
    clf = grid_search.GridSearchCV(classifier, parameters, n_jobs=4, scoring='roc_auc')
    clf.fit(X=X_train, y=y_train)
    model = clf.best_estimator_
    print (clf.best_score_, clf.best_params_) 
    print roc_auc_score(y_test, model.predict(X_test))
    print pd.crosstab(y_test, model.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True)
    return model


C:\Users\Fernando\Anaconda2\lib\site-packages\sklearn\grid_search.py:43: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)

In [3]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = run_classifier({'max_depth':range(5,20)}, DecisionTreeClassifier())


(0.8356516491843838, {'max_depth': 6})
0.526351092476
Predicted     0    1   All
True                      
0          6632   58  6690
1           780   51   831
All        7412  109  7521

In [ ]:


In [41]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = run_classifier({'max_depth':range(5,10), 'n_estimators':[50], 'class_weight':['balanced']}, RandomForestClassifier(random_state=0))


(0.85238623633237032, {'n_estimators': 50, 'max_depth': 9, 'class_weight': 'balanced'})
0.779468251013
Predicted     0     1   All
True                       
0          5446  1244  6690
1           212   619   831
All        5658  1863  7521

In [27]:
#Subsampled
from sklearn.ensemble import RandomForestClassifier

rf_clf = run_classifier({'max_depth':range(6,10), 'n_estimators':[50], 'class_weight':['balanced_subsample']}, RandomForestClassifier(random_state=0))


(0.85249587347790967, {'n_estimators': 50, 'max_depth': 9, 'class_weight': 'balanced_subsample'})
0.782035798892
Predicted     0     1   All
True                       
0          5424  1266  6690
1           205   626   831
All        5629  1892  7521

In [17]:
from sklearn.svm import SVC

#parameters = {'kernel':['linear', 'rbf', 'poly'], 'C':[1, 10, 100, 1000], 'class_weight':['balanced']}
parameters = {'kernel':['rbf'], 'C':[100], 'class_weight':['balanced']}
svc_clf = run_classifier(parameters, SVC(random_state=0))


(0.82797711356597792, {'kernel': 'rbf', 'C': 100, 'class_weight': 'balanced'})
0.754569026458
Predicted     0     1   All
True                       
0          5137  1553  6690
1           215   616   831
All        5352  2169  7521

In [18]:
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42, stratify=y)
#clf = grid_search.GridSearchCV(classifier, parameters, n_jobs=4, scoring='roc_auc')
clf = SVC(C=1, kernel='rbf', class_weight={0:1, 1:8})
clf.fit(X=X_train, y=y_train)
    #model = clf.best_estimator_
    #print (clf.best_score_, clf.best_params_) 
print roc_auc_score(y_train, clf.predict(X_train))
print roc_auc_score(y_test, clf.predict(X_test))
print pd.crosstab(y_test, clf.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True)


0.725930646088
0.726810045706
Predicted     0     1   All
True                       
0          4991  1699  6690
1           243   588   831
All        5234  2287  7521

In [19]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsClassifier

parameters = {'n_neighbors':[3, 5]}
knn_clf = run_classifier(parameters, KNeighborsClassifier(), stratify=False, new_X=X_undersampled, new_y=y_undersampled)


(0.71566986959496537, {'n_neighbors': 5})
0.675567158387
Predicted    0    1   All
True                     
0          573  286   859
1          254  550   804
All        827  836  1663

In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.cross_validation import cross_val_score


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
clf = BernoulliNB()
print cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5)
clf.fit(X_train, y_train)
print roc_auc_score(y_train, clf.predict(X_train))
print roc_auc_score(y_test, clf.predict(X_test))
print pd.crosstab(y_test, clf.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True)


[ 0.75196387  0.77663833  0.74250558  0.73572897  0.7446668 ]
0.666785352467
0.659672554003
Predicted     0     1   All
True                       
0          5574  1116  6690
1           427   404   831
All        6001  1520  7521

In [ ]:


In [23]:
from sklearn.linear_model import SGDClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
clf = SGDClassifier(random_state=0, class_weight='balanced', loss='log')
print cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=10)
clf.fit(X_train, y_train)
print roc_auc_score(y_train, clf.predict(X_train))
print roc_auc_score(y_test, clf.predict(X_test))
print pd.crosstab(y_test, clf.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True)


[ 0.81046854  0.81351643  0.83313246  0.83169492  0.81887942  0.81859137
  0.81792929  0.80912285  0.81394663  0.81957325]
0.724380124297
0.719264973315
Predicted     0     1   All
True                       
0          5679  1011  6690
1           341   490   831
All        6020  1501  7521

In [243]:
from sklearn.linear_model import SGDClassifier
parameters={'l1_ratio':[0.10, 0.15, 0.20], 'alpha':[0.001, 0.0001, 0.00001, 0.000001], 'class_weight':['balanced'], 'random_state':[0], 'loss':['hinge', 'log'], 'penalty':['l2', 'l1', 'elasticnet']}
sgd_clf = run_classifier(parameters, SGDClassifier(), stratify=True, new_X=X_numeric)


(0.79927523380599907, {'loss': 'log', 'l1_ratio': 0.1, 'penalty': 'l1', 'random_state': 0, 'alpha': 0.001, 'class_weight': 'balanced'})
0.718815193753
Predicted     0     1   All
True                       
0          5198  1492  6690
1           282   549   831
All        5480  2041  7521

In [13]:
#SGD for Category Software
from sklearn.linear_model import SGDClassifier
#parameters={'l1_ratio':[0.10, 0.15, 0.20], 'alpha':[0.001, 0.0001, 0.00001, 0.000001], 'class_weight':['balanced'], 'random_state':[0], 'loss':['hinge', 'log'], 'penalty':['l2', 'l1', 'elasticnet']}
parameters={'l1_ratio':[0.10], 'alpha':[0.001], 'class_weight':['balanced'], 'random_state':[0], 'loss':['log'], 'penalty':['l1']}
sgd_clf = run_classifier(parameters, SGDClassifier())


(0.8271532874398471, {'loss': 'log', 'l1_ratio': 0.1, 'penalty': 'l1', 'random_state': 0, 'alpha': 0.001, 'class_weight': 'balanced'})
0.755024202296
Predicted     0     1   All
True                       
0          5288  1402  6690
1           233   598   831
All        5521  2000  7521

In [231]:
from sklearn.linear_model import LogisticRegression
parameters={}
lr_clf = run_classifier(parameters, LogisticRegression(), stratify=True)


(0.80857442131721124, {})
0.525906439376
Predicted     0    1   All
True                      
0          6618   72  6690
1           779   52   831
All        7397  124  7521

In [ ]:


In [ ]:
next: ignore "ipo" and "closed", work only with opearting and acquired

In [21]:
from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_score


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
clf = XGBClassifier(random_state=0, objective='multi:softmax', num_class=2)
#print cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=3)
clf.fit(X_train, y_train, sample_weight=y_train.replace({1:8,0:1}))
print roc_auc_score(y_train, clf.predict(X_train))
print roc_auc_score(y_test, clf.predict(X_test))
print pd.crosstab(y_test, clf.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True)


0.801449128334
0.780187034909
Predicted     0     1   All
True                       
0          5198  1492  6690
1           180   651   831
All        5378  2143  7521

In [22]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics   #Additional scklearn functions
from sklearn.grid_search import GridSearchCV   #Perforing grid search

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [84]:
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['acquired'],eval_metric='auc', sample_weight=dtrain['acquired'].replace({1:4,0:1}))
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print "\nModel Report"
    print "Accuracy : %.4g" % metrics.accuracy_score(dtrain['acquired'].values, dtrain_predictions)
    print "AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['acquired'], dtrain_predprob)
    print pd.crosstab(dtrain['acquired'], dtrain_predictions, rownames=['True'], colnames=['Predicted'], margins=True)
    print "AUC Score (Test): %f" % metrics.roc_auc_score(y_test, alg.predict_proba(X_test)[:,1])
    print pd.crosstab(y_test, alg.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True)
                   
    #feat_imp = pd.Series(alg.Booster().get_fscore()).sort_values(ascending=False)
    #feat_imp.plot(kind='bar', title='Feature Importances')
    #plt.ylabel('Feature Importance Score')

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
train = X_train.join(y_train)

In [58]:
#train = startups
target = 'acquired'

predictors = [x for x in train.columns if x not in [target]]

In [85]:
xgb1 = XGBClassifier(
 learning_rate =0.1,
 n_estimators=2,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

modelfit(XGBClassifier(), train, predictors)


C:\Users\Fernando\Anaconda2\lib\site-packages\xgboost-0.6-py2.7.egg\xgboost\sklearn.py:161: DeprecationWarning: The seed parameter is deprecated as of version .6.Please use random_state instead.seed is deprecated.
  'seed is deprecated.', DeprecationWarning)
C:\Users\Fernando\Anaconda2\lib\site-packages\xgboost-0.6-py2.7.egg\xgboost\sklearn.py:171: DeprecationWarning: The nthread parameter is deprecated as of version .6.Please use n_jobs instead.nthread is deprecated.
  'nthread is deprecated.', DeprecationWarning)
Model Report
Accuracy : 0.8443
AUC Score (Train): 0.877806
Predicted      0     1    All
True                         
0          23081  3674  26755
1           1010  2315   3325
All        24091  5989  30080
AUC Score (Test): 0.855004
Predicted     0     1   All
True                       
0          5758   932  6690
1           275   556   831
All        6033  1488  7521

In [ ]: