Algorithms Exploration

Here we test several supervised learning algorithms against the dataset developed so far.


In [40]:
# Load the prepared startups dataset and derive the feature / target frames.

import pandas as pd
startups = pd.read_csv('data/startups_3.csv', index_col=0)

# Features are every column except the 'acquired' target.  `axis` is passed by
# keyword: the positional form is deprecated and rejected by newer pandas.
X = startups.drop('acquired', axis=1)

# Groups of feature columns, selected by regex on the column names.
X_numeric = X.filter(regex=('(number_of|avg_).*|.*(funding_total_usd|funding_rounds|_at)'))
X_categorical = X.filter(regex=('(Category_|state_).*'))
X_state = X.filter(regex=('(state_).*'))
X_category = X.filter(regex=('(Category_).*'))

# Target labels.
y = startups['acquired']

In [33]:
#startups_not_operating = pd.read_csv('data/startups_not_operating_3.csv', index_col=0)
#X = startups_not_operating.drop('acquired', 1)
#y = startups_not_operating['acquired']

In [25]:
from sklearn.decomposition import PCA

# Project the X_numeric columns onto 6 principal components.
# fit() returns the estimator itself, so construction and fitting are chained.
pca = PCA(n_components=6).fit(X_numeric)

# Wrapping the transformed array in a DataFrame gives it a fresh positional
# index and integer column labels (0..5).
X_pca = pd.DataFrame(pca.transform(X_numeric))
X_pca.head(3)


Out[25]:
0 1 2 3 4 5
0 -0.057245 0.120264 -0.010556 -0.018942 -0.032121 -0.011784
1 -0.087023 -0.012353 -0.062867 -0.050250 0.004071 -0.002986
2 0.191721 0.115093 0.004067 0.007415 0.031256 -0.055648

In [7]:
from imblearn.under_sampling import RandomUnderSampler

# Seeded random under-sampling of (X, y).  With return_indices=True the
# sampler returns a third value, captured here as `indices`.
rus = RandomUnderSampler(
    random_state=42,
    return_indices=True,
)
X_undersampled, y_undersampled, indices = rus.fit_sample(X, y)

In [8]:
# Keep only the rows whose Category_Software flag equals 1, then pull the
# matching target values by index label.
X_software = X.loc[X['Category_Software'].eq(1)]
y_software = y.loc[X_software.index]

In [9]:
# Display the shape of the Software-only target (number of samples).
y_software.shape


Out[9]:
(5492L,)

In [10]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn import grid_search

def run_classifier(parameters, classifier, stratify=True, new_X=None, new_y=None):
    """Grid-search `classifier` on an 80/20 split and print held-out results.

    Parameters
    ----------
    parameters : dict
        Parameter grid handed to GridSearchCV.
    classifier : estimator
        Unfitted estimator to tune.
    stratify : bool, default True
        When true, the train/test split is stratified on the target that is
        actually being split (`new_y` if given, otherwise the notebook-level
        `y`).
    new_X, new_y : optional
        Replacement features / target.  The notebook-level `X` / `y` are used
        for whichever one is left as None.

    Returns
    -------
    The grid search's `best_estimator_`.

    Prints the best CV score with its parameters, the ROC AUC of the test
    predictions, and a confusion table with margins.
    """
    features = X if new_X is None else new_X
    target = y if new_y is None else new_y

    # Stratify on the target being split.  Using the global `y` here would use
    # the wrong labels (or fail on a length mismatch) whenever `new_y` is given.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42,
        stratify=target if stratify else None)

    clf = grid_search.GridSearchCV(classifier, parameters, n_jobs=4, scoring='roc_auc')
    clf.fit(X=X_train, y=y_train)
    model = clf.best_estimator_
    # Printed as a single tuple so the output is the same under Python 2 and 3.
    print((clf.best_score_, clf.best_params_))

    # Predict once and reuse the labels for both reports.
    # NOTE(review): this AUC is computed from hard class predictions, while the
    # grid search's 'roc_auc' scorer presumably ranks continuous scores, so the
    # two printed numbers are not directly comparable -- confirm before
    # comparing them.
    predicted = model.predict(X_test)
    print(roc_auc_score(y_test, predicted))
    print(pd.crosstab(y_test, predicted, rownames=['True'], colnames=['Predicted'], margins=True))
    return model

In [12]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree on the default X / y: max_depth searched over 5..19.
dt_clf = run_classifier(
    parameters={'max_depth': range(5, 20)},
    classifier=DecisionTreeClassifier(),
)


(0.83557555703535935, {'max_depth': 6})
0.526351092476
Predicted     0    1   All
True                      
0          6632   58  6690
1           780   51   831
All        7412  109  7521

In [41]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with class_weight='balanced': 50 trees, max_depth searched
# over 5..9.
rf_clf = run_classifier(
    parameters={
        'max_depth': range(5, 10),
        'n_estimators': [50],
        'class_weight': ['balanced'],
    },
    classifier=RandomForestClassifier(random_state=0),
)


(0.85238623633237032, {'n_estimators': 50, 'max_depth': 9, 'class_weight': 'balanced'})
0.779468251013
Predicted     0     1   All
True                       
0          5446  1244  6690
1           212   619   831
All        5658  1863  7521

In [27]:
# Random forest, class_weight='balanced_subsample' variant: 50 trees,
# max_depth searched over 6..9.  Rebinds `rf_clf`.
from sklearn.ensemble import RandomForestClassifier

rf_clf = run_classifier(
    parameters={
        'max_depth': range(6, 10),
        'n_estimators': [50],
        'class_weight': ['balanced_subsample'],
    },
    classifier=RandomForestClassifier(random_state=0),
)


(0.85249587347790967, {'n_estimators': 50, 'max_depth': 9, 'class_weight': 'balanced_subsample'})
0.782035798892
Predicted     0     1   All
True                       
0          5424  1266  6690
1           205   626   831
All        5629  1892  7521

In [17]:
from sklearn.svm import SVC

# Wider grid, currently disabled:
#parameters = {'kernel':['linear', 'rbf', 'poly'], 'C':[1, 10, 100, 1000], 'class_weight':['balanced']}

# Single-candidate grid: RBF kernel, C=100, balanced class weights.
parameters = {
    'kernel': ['rbf'],
    'C': [100],
    'class_weight': ['balanced'],
}
svc_clf = run_classifier(parameters=parameters, classifier=SVC(random_state=0))


(0.82797711356597792, {'kernel': 'rbf', 'C': 100, 'class_weight': 'balanced'})
0.754569026458
Predicted     0     1   All
True                       
0          5137  1553  6690
1           215   616   831
All        5352  2169  7521

In [18]:
from sklearn.svm import SVC

# RBF SVC on the PCA projection with a manual 1:8 class weighting; fitted
# directly, without a grid search.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
clf = SVC(C=1, kernel='rbf', class_weight={0:1, 1:8})
clf.fit(X=X_train, y=y_train)

# ROC AUC of the predicted labels on the train and test splits, then the test
# confusion table.
print(roc_auc_score(y_train, clf.predict(X_train)))
print(roc_auc_score(y_test, clf.predict(X_test)))
print(pd.crosstab(y_test, clf.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True))


0.725930646088
0.726810045706
Predicted     0     1   All
True                       
0          4991  1699  6690
1           243   588   831
All        5234  2287  7521

In [19]:
import warnings
# Installs an 'ignore' filter for all warnings from this point on.
warnings.filterwarnings('ignore')

from sklearn.neighbors import KNeighborsClassifier

# k-NN on the under-sampled data, n_neighbors in {3, 5}; the split is not
# stratified.
parameters = {'n_neighbors': [3, 5]}
knn_clf = run_classifier(
    parameters,
    KNeighborsClassifier(),
    stratify=False,
    new_X=X_undersampled,
    new_y=y_undersampled,
)


(0.71566986959496537, {'n_neighbors': 5})
0.675567158387
Predicted    0    1   All
True                     
0          573  286   859
1          254  550   804
All        827  836  1663

In [22]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB

from sklearn.cross_validation import cross_val_score


# Bernoulli naive Bayes on the full feature set, stratified 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
clf = BernoulliNB()

# 5-fold cross-validated ROC AUC on the training split.
print(cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5))

# Fit on the whole training split, then report ROC AUC of the predicted labels
# (train, test) and the test confusion table.
clf.fit(X_train, y_train)
print(roc_auc_score(y_train, clf.predict(X_train)))
print(roc_auc_score(y_test, clf.predict(X_test)))
print(pd.crosstab(y_test, clf.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True))


[ 0.75196387  0.77663833  0.74250558  0.73572897  0.7446668 ]
0.666785352467
0.659672554003
Predicted     0     1   All
True                       
0          5574  1116  6690
1           427   404   831
All        6001  1520  7521

In [ ]:


In [23]:
from sklearn.linear_model import SGDClassifier

# SGD classifier with log loss and balanced class weights on the full feature
# set, stratified 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
clf = SGDClassifier(random_state=0, class_weight='balanced', loss='log')

# 10-fold cross-validated ROC AUC on the training split.
print(cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=10))

# Fit on the whole training split, then report ROC AUC of the predicted labels
# (train, test) and the test confusion table.
clf.fit(X_train, y_train)
print(roc_auc_score(y_train, clf.predict(X_train)))
print(roc_auc_score(y_test, clf.predict(X_test)))
print(pd.crosstab(y_test, clf.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True))


[ 0.81046854  0.81351643  0.83313246  0.83169492  0.81887942  0.81859137
  0.81792929  0.80912285  0.81394663  0.81957325]
0.724380124297
0.719264973315
Predicted     0     1   All
True                       
0          5679  1011  6690
1           341   490   831
All        6020  1501  7521

In [243]:
from sklearn.linear_model import SGDClassifier

# SGD grid search restricted to the X_numeric columns (target stays the
# default y, stratified split).
parameters = {
    'l1_ratio': [0.10, 0.15, 0.20],
    'alpha': [0.001, 0.0001, 0.00001, 0.000001],
    'class_weight': ['balanced'],
    'random_state': [0],
    'loss': ['hinge', 'log'],
    'penalty': ['l2', 'l1', 'elasticnet'],
}
sgd_clf = run_classifier(
    parameters,
    SGDClassifier(),
    stratify=True,
    new_X=X_numeric,
)


(0.79927523380599907, {'loss': 'log', 'l1_ratio': 0.1, 'penalty': 'l1', 'random_state': 0, 'alpha': 0.001, 'class_weight': 'balanced'})
0.718815193753
Predicted     0     1   All
True                       
0          5198  1492  6690
1           282   549   831
All        5480  2041  7521

In [274]:
# SGD grid search on the Category_Software subset only (non-stratified split).
# Rebinds `sgd_clf`.
from sklearn.linear_model import SGDClassifier

parameters = {
    'l1_ratio': [0.10, 0.15, 0.20],
    'alpha': [0.001, 0.0001, 0.00001, 0.000001],
    'class_weight': ['balanced'],
    'random_state': [0],
    'loss': ['hinge', 'log'],
    'penalty': ['l2', 'l1', 'elasticnet'],
}
sgd_clf = run_classifier(
    parameters,
    SGDClassifier(),
    stratify=False,
    new_X=X_software,
    new_y=y_software,
)


(0.80426020803093701, {'loss': 'log', 'l1_ratio': 0.1, 'penalty': 'l1', 'random_state': 0, 'alpha': 0.001, 'class_weight': 'balanced'})
0.72061148662
Predicted    0    1   All
True                     
0          639  298   937
1           39  123   162
All        678  421  1099

In [231]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings: the grid is empty, so no
# hyper-parameters are varied.
parameters = {}
lr_clf = run_classifier(
    parameters=parameters,
    classifier=LogisticRegression(),
    stratify=True,
)


(0.80857442131721124, {})
0.525906439376
Predicted     0    1   All
True                      
0          6618   72  6690
1           779   52   831
All        7397  124  7521

In [ ]:


In [ ]:
Next: ignore "ipo" and "closed" startups; work only with operating and acquired ones.

In [ ]: