In [1]:
#Importing the dataset
import pandas as pd
startups = pd.read_csv('data/startups_3.csv', index_col=0)
startups[:3]
X = startups.drop('acquired', axis=1)
X_numeric = X.filter(regex=('(number_of|avg_).*|.*(funding_total_usd|funding_rounds|_at)'))
X_categorical = X.filter(regex=('(Category_|state_).*'))
X_state = X.filter(regex=('(state_).*'))
X_category = X.filter(regex=('(Category_).*'))
y = startups['acquired']
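Quick sanity check (optional): the shapes of the feature blocks and the class balance of the target, using plain pandas.
In [ ]:
# Optional sanity check: feature-block shapes and class balance of the target
print(X.shape, X_numeric.shape, X_categorical.shape)
print(y.value_counts(normalize=True))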
In [9]:
startups_not_operating = pd.read_csv('data/startups_not_operating_3.csv', index_col=0)
X = startups_not_operating.drop('acquired', axis=1)
y = startups_not_operating['acquired']
In [7]:
from sklearn.decomposition import PCA
pca = PCA(n_components=6)
pca.fit(X_numeric)
X_pca = pca.transform(X_numeric)
X_pca = pd.DataFrame(X_pca)
X_pca[:3]
Out[7]:
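A quick check on the PCA fit above (optional): how much variance the six components retain. Note the numeric features are not standardised here, so the high-magnitude funding columns will dominate the leading components unless scaled first.
In [ ]:
# Share of variance captured by each of the 6 components, and their total
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_.sum())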
In [7]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=42, return_indices=True)
X_undersampled, y_undersampled, indices = rus.fit_sample(X, y)
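After random undersampling the two classes should be balanced 1:1; a one-line check, assuming the target is encoded as 0/1 as above.
In [ ]:
# Class counts after random undersampling (expected to be equal)
import numpy as np
print(np.bincount(y_undersampled))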
In [6]:
X_software = X[X['Category_Software'] == 1]
y_software = y.loc[X_software.index]
In [9]:
y_software.shape
Out[9]:
In [13]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn import grid_search
def run_classifier(parameters, classifier, stratify=True, new_X=None, new_y=None):
    # Use the supplied data if given, otherwise fall back to the global X / y
    X_data = new_X if new_X is not None else X
    y_data = new_y if new_y is not None else y
    # Stratify on the labels actually being split
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, test_size=0.2, random_state=42,
        stratify=y_data if stratify else None)
    clf = grid_search.GridSearchCV(classifier, parameters, n_jobs=4, scoring='roc_auc')
    clf.fit(X=X_train, y=y_train)
    model = clf.best_estimator_
    print(clf.best_score_, clf.best_params_)
    print(roc_auc_score(y_test, model.predict(X_test)))
    print(pd.crosstab(y_test, model.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True))
    return model
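Note the helper relies on sklearn.cross_validation and sklearn.grid_search, which were removed in scikit-learn 0.20. A minimal equivalence sketch if re-running on a current scikit-learn (the cells below keep the old imports):
In [ ]:
# Post-0.20 scikit-learn: the removed modules live in sklearn.model_selection,
# and grid_search.GridSearchCV(...) in the helper becomes GridSearchCV(...).
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score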
In [3]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = run_classifier({'max_depth':range(5,20)}, DecisionTreeClassifier())
In [ ]:
In [41]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = run_classifier({'max_depth':range(5,10), 'n_estimators':[50], 'class_weight':['balanced']}, RandomForestClassifier(random_state=0))
In [27]:
#Same random forest, but with class_weight='balanced_subsample'
from sklearn.ensemble import RandomForestClassifier
rf_clf = run_classifier({'max_depth':range(6,10), 'n_estimators':[50], 'class_weight':['balanced_subsample']}, RandomForestClassifier(random_state=0))
In [17]:
from sklearn.svm import SVC
#parameters = {'kernel':['linear', 'rbf', 'poly'], 'C':[1, 10, 100, 1000], 'class_weight':['balanced']}
parameters = {'kernel':['rbf'], 'C':[100], 'class_weight':['balanced']}
svc_clf = run_classifier(parameters, SVC(random_state=0))
In [18]:
from sklearn.svm import SVC
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42, stratify=y)
#clf = grid_search.GridSearchCV(classifier, parameters, n_jobs=4, scoring='roc_auc')
clf = SVC(C=1, kernel='rbf', class_weight={0:1, 1:8})
clf.fit(X=X_train, y=y_train)
#model = clf.best_estimator_
#print (clf.best_score_, clf.best_params_)
print(roc_auc_score(y_train, clf.predict(X_train)))
print(roc_auc_score(y_test, clf.predict(X_test)))
print(pd.crosstab(y_test, clf.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True))
In [19]:
import warnings
warnings.filterwarnings('ignore')
from sklearn.neighbors import KNeighborsClassifier
parameters = {'n_neighbors':[3, 5]}
knn_clf = run_classifier(parameters, KNeighborsClassifier(), stratify=False, new_X=X_undersampled, new_y=y_undersampled)
In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.cross_validation import cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
clf = BernoulliNB()
print(cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5))
clf.fit(X_train, y_train)
print(roc_auc_score(y_train, clf.predict(X_train)))
print(roc_auc_score(y_test, clf.predict(X_test)))
print(pd.crosstab(y_test, clf.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True))
In [ ]:
In [23]:
from sklearn.linear_model import SGDClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
clf = SGDClassifier(random_state=0, class_weight='balanced', loss='log')
print(cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=10))
clf.fit(X_train, y_train)
print(roc_auc_score(y_train, clf.predict(X_train)))
print(roc_auc_score(y_test, clf.predict(X_test)))
print(pd.crosstab(y_test, clf.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True))
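SGD (like the SVC and KNN runs above) is sensitive to feature scale, and the inputs here are unscaled. A minimal sketch of the same model behind a StandardScaler via scikit-learn's Pipeline; not part of the original run.
In [ ]:
# Sketch: same SGD model, but with the features standardised first
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
scaled_sgd = make_pipeline(StandardScaler(), SGDClassifier(random_state=0, class_weight='balanced', loss='log'))
print(cross_val_score(scaled_sgd, X_train, y_train, scoring='roc_auc', cv=10))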
In [243]:
from sklearn.linear_model import SGDClassifier
parameters={'l1_ratio':[0.10, 0.15, 0.20], 'alpha':[0.001, 0.0001, 0.00001, 0.000001], 'class_weight':['balanced'], 'random_state':[0], 'loss':['hinge', 'log'], 'penalty':['l2', 'l1', 'elasticnet']}
sgd_clf = run_classifier(parameters, SGDClassifier(), stratify=True, new_X=X_numeric)
In [13]:
#SGD restricted to the Software category (uses the X_software / y_software subset built above)
from sklearn.linear_model import SGDClassifier
#parameters={'l1_ratio':[0.10, 0.15, 0.20], 'alpha':[0.001, 0.0001, 0.00001, 0.000001], 'class_weight':['balanced'], 'random_state':[0], 'loss':['hinge', 'log'], 'penalty':['l2', 'l1', 'elasticnet']}
parameters={'l1_ratio':[0.10], 'alpha':[0.001], 'class_weight':['balanced'], 'random_state':[0], 'loss':['log'], 'penalty':['l1']}
sgd_clf = run_classifier(parameters, SGDClassifier(), new_X=X_software, new_y=y_software)
In [231]:
from sklearn.linear_model import LogisticRegression
parameters={}
lr_clf = run_classifier(parameters, LogisticRegression(), stratify=True)
In [ ]:
In [ ]:
Next: ignore "ipo" and "closed"; work only with operating and acquired.
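A sketch of that next step; the raw export and its 'status' column (with labels 'operating', 'acquired', 'ipo', 'closed') are assumptions, since the frames loaded above already carry only the binary 'acquired' flag.
In [ ]:
# Sketch only -- the file name, 'status' column and its labels are hypothetical
# raw = pd.read_csv('data/startups_raw.csv', index_col=0)
# keep = raw[raw['status'].isin(['operating', 'acquired'])]   # drop 'ipo' and 'closed' rows
# y_keep = (keep['status'] == 'acquired').astype(int)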
In [21]:
from xgboost import XGBClassifier
from sklearn.cross_validation import cross_val_score
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
clf = XGBClassifier(random_state=0, objective='multi:softmax', num_class=2)
#print cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=3)
clf.fit(X_train, y_train, sample_weight=y_train.replace({1:8,0:1}))
print(roc_auc_score(y_train, clf.predict(X_train)))
print(roc_auc_score(y_test, clf.predict(X_test)))
print(pd.crosstab(y_test, clf.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True))
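The 8:1 sample_weight above can equivalently be expressed through XGBoost's scale_pos_weight parameter with a binary objective; a sketch, kept to hard predictions as in the cell above.
In [ ]:
# Sketch: same 8:1 positive-class weighting via scale_pos_weight
clf_w = XGBClassifier(random_state=0, objective='binary:logistic', scale_pos_weight=8)
clf_w.fit(X_train, y_train)
print(roc_auc_score(y_test, clf_w.predict(X_test)))
print(pd.crosstab(y_test, clf_w.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True))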
In [22]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics #Additional scikit-learn functions
from sklearn.grid_search import GridSearchCV #Performing grid search
import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
In [84]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        # Pick n_estimators via xgboost's built-in CV with early stopping
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc', early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])
    #Fit the algorithm on the data (positives up-weighted 4:1)
    alg.fit(dtrain[predictors], dtrain['acquired'], eval_metric='auc',
            sample_weight=dtrain['acquired'].replace({1: 4, 0: 1}))
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    #Print model report (test metrics use the global X_test / y_test split):
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['acquired'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['acquired'], dtrain_predprob))
    print(pd.crosstab(dtrain['acquired'], dtrain_predictions, rownames=['True'], colnames=['Predicted'], margins=True))
    print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, alg.predict_proba(X_test)[:, 1]))
    print(pd.crosstab(y_test, alg.predict(X_test), rownames=['True'], colnames=['Predicted'], margins=True))
    #feat_imp = pd.Series(alg.Booster().get_fscore()).sort_values(ascending=False)
    #feat_imp.plot(kind='bar', title='Feature Importances')
    #plt.ylabel('Feature Importance Score')
In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
train = X_train.join(y_train)
In [58]:
#train = startups
target = 'acquired'
predictors = [x for x in train.columns if x not in [target]]
In [85]:
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=2,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
modelfit(xgb1, train, predictors)
In [ ]: