In [1]:
# Core scientific stack plus xgboost for the multi-class model below.
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
import xgboost as xgb
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18
# and removed in 0.20 -- these names now live in sklearn.model_selection
# (with a changed KFold constructor/split API). TODO: migrate on upgrade.
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score
# Notes on installing xgboost for Python.
#
# If installing from source and you then have problems loading other
# packages, it is because of the easy-install.pth file that the install
# script dumps in the Python dist-packages directory. You will have to
# delete the easy-install.pth file, then
# go change the installation of the xgboost egg and egg-info files in the
# python dist-packages directory from:
# /usr/local/lib/python2.7/dist-packages/xgboost-0.4-py2.7.egg/EGG_INFO
# to
# /usr/local/lib/python2.7/dist-packages/xgboost-0.4-py2.7.dist-info
# and
# /usr/local/lib/python2.7/dist-packages/xgboost-0.4-py2.7.egg/xgboost
# to
# /usr/local/lib/python2.7/dist-packages/xgboost
# Now python will be able to find all the packages.
#
# If installing from pip, DON'T EVEN TRY -- it is a nightmare.
#
# Anaconda install:
# XGBoost is not a part of the official distribution but several
# community members have created Conda packages for it. The
# most up to date package seems to be by user creditx. The following
# command will install the package:
#
# conda install -c creditx xgboost
#
In [2]:
# Load the engineered training features (30% sample), the sorted class
# labels, and the polynomial-feature variant of the same sample.
# Paths are relative to the notebook's data/ directory.
data = pd.read_csv('data/final-combined-train-data-30percent.csv')
labels = pd.read_csv('data/sorted-train-labels.csv')
datapoly = pd.read_csv('data/final-combined-train-data-30percent-poly.csv')
In [4]:
# Sanity-check the three loaded frames (first rows only).
data.head()
Out[4]:
In [5]:
labels.head()
Out[5]:
In [4]:
datapoly.head()
Out[4]:
In [3]:
# The sklearn-style xgb.XGBClassifier accepts DataFrames/ndarrays directly,
# so an explicit xgboost DMatrix is not required here.
X = data.iloc[:, 1:]          # drop the identifier column
# Zero-base the 1..9 class labels (xgboost expects classes 0..8) and
# flatten to the 1-D vector sklearn expects -- avoids the column-vector
# DataConversionWarning that run_cv otherwise has to work around.
y = labels.iloc[:, 1:].values.ravel() - 1
Xpoly = datapoly.iloc[:, 1:]
In [5]:
def run_cv(X, y, clf, n_folds=10, n_classes=9, random_state=None):
    """Produce out-of-fold class probabilities and predictions via K-fold CV.

    Replaces the removed sklearn.cross_validation.KFold dependency with a
    self-contained shuffled split, so the function works on any
    scikit-learn-era environment.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are addressed positionally.
    y : numpy.ndarray
        Class labels (zero-based). A column vector is accepted and
        flattened before fitting.
    clf : estimator
        Any sklearn-style classifier with fit/predict/predict_proba.
    n_folds : int
        Number of cross-validation folds (default 10, as before).
    n_classes : int
        Width of the returned probability matrix (default 9, as before).
    random_state : int or None
        Seed for the shuffle, for reproducible folds.

    Returns
    -------
    (y_prob, y_pred) : tuple of numpy.ndarray
        Out-of-fold probabilities of shape (len(y), n_classes) and
        predictions of shape (len(y),).
    """
    n = len(y)
    rng = np.random.RandomState(random_state)
    shuffled = rng.permutation(n)
    y_prob = np.zeros((n, n_classes))
    y_pred = np.zeros(n)
    # Each chunk of the shuffled index order serves once as the test fold.
    for test_index in np.array_split(shuffled, n_folds):
        train_index = np.setdiff1d(shuffled, test_index)
        # Positional indexing (iloc): fold indices are row positions,
        # not index labels.
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        # ravel() to 1-D to avoid sklearn's column-vector
        # DataConversionWarning (the problem the original commented-out
        # variants were fighting).
        y_train = y[train_index].ravel()
        clf.fit(X_train, y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    return y_prob, y_pred
In [6]:
# "multi:softprob" is required so predict_proba() yields the full per-class
# probability matrix; "multi:softmax" produces hard class labels, which
# makes the log-loss computed below unreliable. (The full-data run later
# in this notebook already uses softprob.)
xgclf = xgb.XGBClassifier(n_estimators=1000, objective="multi:softprob", nthread=4)
prob, pred = run_cv(Xpoly, y, xgclf)
print("logloss: {:.4f}".format(log_loss(y, prob)))
print("accuracy: {:.4f}".format(accuracy_score(y, pred)))
cm = confusion_matrix(y, pred)
print(cm)
In [30]:
# Same model on the non-polynomial features. As above, the objective must
# be "multi:softprob" (not "multi:softmax") so that predict_proba() returns
# real class probabilities for the log-loss computation.
xgclf = xgb.XGBClassifier(n_estimators=1000, objective="multi:softprob", nthread=4)
prob, pred = run_cv(X, y, xgclf)
print("logloss: {:.4f}".format(log_loss(y, prob)))
print("accuracy: {:.4f}".format(accuracy_score(y, pred)))
cm = confusion_matrix(y, pred)
print(cm)
In [35]:
print("logloss: {:.4f}".format(log_loss(y, prob)))
In [49]:
# Flatten the labels to a plain Python list (works around sklearn's
# column-vector DataConversionWarning).
labelarr = y.ravel().tolist()
In [ ]:
# NOTE(review): this dumps the entire label list into the notebook output;
# prefer labelarr[:10] for a preview.
labelarr
In [51]:
type(labelarr)
Out[51]:
In [14]:
# Reload with the full combined feature set (not the 30% sample above);
# this rebinds `data`/`labels` for all subsequent cells.
data = pd.read_csv('data/all-combined-train-data-final.csv')
labels = pd.read_csv('data/sorted-train-labels.csv')
In [15]:
# (The unused `import array` was removed.)
X = data.iloc[:, 1:]                 # drop the identifier column
ylabels = labels.iloc[:, 1:].values
# Zero-base the 1..9 labels and flatten to the 1-D vector sklearn/xgboost
# expect (avoids the column-vector DataConversionWarning downstream).
y = ylabels.ravel() - 1
# Preview a slice instead of dumping the whole array into the output.
y[:10]
Out[15]:
In [16]:
# Cross-validated xgboost on the full feature set; softprob gives the
# per-class probability matrix that log_loss needs.
xgclf = xgb.XGBClassifier(n_estimators=1000, objective="multi:softprob", nthread=4)
prob, pred = run_cv(X, y, xgclf)
cm = confusion_matrix(y, pred)
# Report the three evaluation artifacts: log loss, accuracy, confusion matrix.
print("logloss: {:.4f}".format(log_loss(y, prob)))
print("accuracy: {:.4f}".format(accuracy_score(y, pred)))
print(cm)
In [ ]:
In [ ]:
In [9]:
data.shape
Out[9]:
In [ ]:
In [ ]:
In [10]:
# Report duplicated feature (column) names in the combined training data.
# collections.Counter replaces the hand-rolled counting dict, and each
# duplicated name is reported once with its total occurrence count
# (instead of once per extra occurrence with a running count).
from collections import Counter

column_names = data.columns
name_map = Counter(column_names)   # Counter is a dict subclass, as before
for cname, count in name_map.items():
    if count > 1:
        print("Feature Name: {:s} -> {:d}".format(cname, count))
In [11]:
call_graph_features_train = pd.read_csv('data/final-call-graph-features-10percent.csv')
In [12]:
# Same duplicate-column check for the call-graph features. As with the
# check on `data`, collections.Counter replaces the hand-rolled counting
# dict and each duplicated name is reported once with its total count.
from collections import Counter

column_names = call_graph_features_train.columns
name_map = Counter(column_names)   # Counter is a dict subclass, as before
for cname, count in name_map.items():
    if count > 1:
        print("Feature Name: {:s} -> {:d}".format(cname, count))
In [ ]:
help(data.merge)