In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score
# Notes on installing xgboost for Python.
#
# If you install from source and then have problems loading other
# packages, the cause is the easy-install.pth file that the install
# script dumps into the Python dist-packages directory. Delete the
# easy-install.pth file, then rename the xgboost egg and egg-info
# entries in dist-packages from:
# /usr/local/lib/python2.7/dist-packages/xgboost-0.4-py2.7.egg/EGG_INFO
# to
# /usr/local/lib/python2.7/dist-packages/xgboost-0.4-py2.7.dist-info
# and
# /usr/local/lib/python2.7/dist-packages/xgboost-0.4-py2.7.egg/xgboost
# to
# /usr/local/lib/python2.7/dist-packages/xgboost
# After that Python will be able to find all the packages.
#
# If installing from pip: don't even try, it is a nightmare.
#
# Anaconda install:
# XGBoost is not part of the official distribution, but several
# community members have created conda packages for it. The most
# up-to-date package appears to be the one by user creditx. The
# following command installs it:
#
# conda install -c creditx xgboost
#

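A quick sanity check that whichever install route succeeded is visible to
this interpreter (assuming a build recent enough to expose __version__):

In [ ]:
# Sanity check: confirm the interpreter can see the xgboost install.
print(xgb.__version__)
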
In [2]:
data = pd.read_csv('data/final-combined-train-data-30percent.csv')
labels = pd.read_csv('data/sorted-train-labels.csv')
datapoly = pd.read_csv('data/final-combined-train-data-30percent-poly.csv')

In [4]:
data.head()


Out[4]:
filename edx esi es ds ss cs ah al ax ... ASM_964 ASM_972 ASM_977 ASM_990 trainmean trainstd trainmin trainmax traintotal trainlogtotal
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 ... 32 49 53 10 586.160040 12877.609022 0.0 288961.0 2.181176e+12 28.410885
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 ... 48 9 9 116 5.908549 60.063976 0.0 1068.0 3.790235e+05 12.845354
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 ... 48 9 9 116 7.002982 64.756651 0.0 1173.0 5.319434e+05 13.184292
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 ... 48 9 9 116 327.150099 3278.958529 0.0 81305.0 8.721682e+10 25.191663
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 ... 48 89 32 71 5.932406 60.189034 0.0 1068.0 3.813462e+05 12.851463

5 rows × 623 columns


In [5]:
labels.head()


Out[5]:
Id Class
0 01IsoiSMh5gxyDYTl4CB 2
1 01SuzwMJEIXsK7A8dQbl 8
2 01azqd4InC7m9JpocGv5 9
3 01jsnpXSAlgw6aPeDxrU 9
4 01kcPWA9K2BOxQeS5Rju 1

In [4]:
datapoly.head()


Out[4]:
filename edx esi es ds ss cs ah al ax ... train_byte_p1 train_byte_p2 train_byte_p3 train_byte_p4 train_byte_p5 train_byte_p6 train_byte_p7 train_byte_p8 train_byte_p9 train_byte_p10
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 ... 1.0 0.614952 6874624.0 0.378166 4.227563e+06 4.726046e+13 0.232554 2.599748e+06 2.906291e+13 3.248979e+20
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 ... 1.0 0.843262 460288.0 0.711091 3.881435e+05 2.118650e+11 0.599636 3.273068e+05 1.786578e+11 9.751894e+16
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 ... 1.0 0.703961 5256192.0 0.495561 3.700153e+06 2.762755e+13 0.348855 2.604762e+06 1.944871e+13 1.452157e+20
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 ... 1.0 0.806035 4825600.0 0.649692 3.889601e+06 2.328642e+13 0.523674 3.135154e+06 1.876966e+13 1.123709e+20
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 ... 1.0 0.871610 712704.0 0.759704 6.211998e+05 5.079470e+11 0.662165 5.414439e+05 4.427316e+11 3.620159e+17

5 rows × 653 columns


In [3]:
# TODO: change to xgboost DMatrix (sketched below)
X = data.iloc[:, 1:]
# Shift labels from 1-9 to 0-8; xgboost's multiclass objectives expect
# zero-based class labels.
y = np.array(labels.iloc[:, 1:] - 1)
Xpoly = datapoly.iloc[:, 1:]

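One way to tackle the TODO above — a minimal sketch of wrapping the frames
in xgboost's native DMatrix, which is only needed when dropping down to the
low-level xgb.train API (the sklearn-style XGBClassifier used below accepts
DataFrames directly):

In [ ]:
# Sketch only: wrap features and 0-based labels in a DMatrix for the
# low-level API. Older xgboost builds may need X.values instead of X.
dtrain = xgb.DMatrix(X, label=y.ravel())
dtrain_poly = xgb.DMatrix(Xpoly, label=y.ravel())
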
In [5]:
def run_cv(X, y, clf):

    # Construct a KFold object (model_selection API; the old
    # cross_validation module was removed from sklearn).
    kf = KFold(n_splits=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))

    # Iterate through folds.
    for train_index, test_index in kf.split(X):
        print(train_index, test_index)
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]

        # flatten() turns the (n_samples, 1) label column vector into the
        # 1-d array sklearn expects, avoiding the DataConversionWarning.
        clf.fit(X_train, y_train.flatten())
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)

    return y_prob, y_pred

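cross_val_score (imported above but otherwise unused) is a one-line
alternative when only per-fold scores are needed; run_cv is used below
because it also returns the per-sample probabilities and predictions.
A sketch, assuming the model_selection scorer names:

In [ ]:
# Per-fold scores only; no per-sample probabilities or predictions.
clf = xgb.XGBClassifier(n_estimators=100, nthread=4)
scores = cross_val_score(clf, X, y.ravel(), cv=10, scoring='neg_log_loss')
print("mean CV log loss: {:.4f}".format(-scores.mean()))
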
In [6]:
xgclf = xgb.XGBClassifier(n_estimators=1000, objective="multi:softmax", nthread=4)
prob, pred = run_cv(Xpoly,y,xgclf)
print("logloss: {:.4f}".format(log_loss(y, prob)))
print("accuracy: {:.4f}".format(accuracy_score(y, pred)))
cm = confusion_matrix(y, pred)
print(cm)


[    0     1     2 ..., 10865 10866 10867] [    8    16    21 ..., 10844 10849 10864]
[    0     1     2 ..., 10864 10865 10866] [   23    25    38 ..., 10847 10857 10867]
[    0     1     2 ..., 10865 10866 10867] [    7     9    10 ..., 10819 10861 10863]
[    0     1     2 ..., 10865 10866 10867] [   29    36    40 ..., 10823 10824 10832]
[    0     1     2 ..., 10865 10866 10867] [    6    14    18 ..., 10808 10836 10860]
[    0     1     2 ..., 10865 10866 10867] [   11    26    32 ..., 10840 10841 10848]
[    0     1     2 ..., 10864 10866 10867] [    4    12    17 ..., 10854 10858 10865]
[    0     2     4 ..., 10865 10866 10867] [    1     3     5 ..., 10853 10855 10859]
[    1     2     3 ..., 10865 10866 10867] [    0    31    54 ..., 10850 10851 10856]
[    0     1     3 ..., 10864 10865 10867] [    2    15    30 ..., 10843 10862 10866]
logloss: 0.0081
accuracy: 0.9982
[[1541    0    0    0    0    0    0    0    0]
 [   2 2475    0    0    0    0    0    0    1]
 [   0    0 2941    0    0    0    1    0    0]
 [   0    0    0  474    0    1    0    0    0]
 [   1    0    0    0   41    0    0    0    0]
 [   4    0    0    0    1  746    0    0    0]
 [   0    0    0    0    0    0  398    0    0]
 [   0    0    0    0    0    0    0 1226    2]
 [   0    0    0    0    0    0    0    7 1006]]

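matplotlib is imported at the top but never used; a heatmap makes the
off-diagonal misclassifications easier to spot. A sketch with a
hypothetical plot_confusion_matrix helper:

In [ ]:
def plot_confusion_matrix(cm, n_classes=9):
    # Render the confusion matrix as a heatmap; rows are true classes,
    # columns are predicted classes (0-8, i.e. original labels 1-9).
    plt.figure(figsize=(6, 6))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    ticks = np.arange(n_classes)
    plt.xticks(ticks, ticks)
    plt.yticks(ticks, ticks)
    plt.xlabel('Predicted class')
    plt.ylabel('True class')
    plt.show()

plot_confusion_matrix(cm)
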
In [30]:
xgclf = xgb.XGBClassifier(n_estimators=1000, objective="multi:softmax", nthread=4)
prob, pred = run_cv(X,y,xgclf)
print("logloss: {:.4f}".format(log_loss(y, prob)))
print("accuracy: {:.4f}".format(accuracy_score(y, pred)))
cm = confusion_matrix(y, pred)
print(cm)


/home/derek/anaconda3/lib/python3.5/site-packages/sklearn/preprocessing/label.py:108: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
(the same warning is repeated once per fold, 10 times in total)
accuracy: 0.9981
[[1540    0    0    0    0    1    0    0    0]
 [   2 2475    0    1    0    0    0    0    0]
 [   0    0 2941    0    0    0    1    0    0]
 [   0    0    0  474    0    1    0    0    0]
 [   1    0    0    0   41    0    0    0    0]
 [   4    0    0    0    1  746    0    0    0]
 [   0    0    0    0    0    0  398    0    0]
 [   0    0    0    0    0    0    0 1227    1]
 [   0    0    0    0    0    0    0    8 1005]]

In [35]:
print("logloss: {:.4f}".format(log_loss(y, prob)))


logloss: 0.0080

In [49]:
labelarr = y.ravel().tolist()

In [ ]:
labelarr

In [51]:
type(labelarr)


Out[51]:
list

In [14]:
data = pd.read_csv('data/all-combined-train-data-final.csv')
labels = pd.read_csv('data/sorted-train-labels.csv')

In [15]:
X = data.iloc[:, 1:]
ylabels = labels.iloc[:, 1:].values
y = ylabels - 1
y


Out[15]:
array([[1],
       [7],
       [8],
       ..., 
       [3],
       [3],
       [3]])

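This (n_samples, 1) column vector is exactly what triggers sklearn's
DataConversionWarning seen earlier; ravel() collapses it to the expected
1-d shape:

In [ ]:
# Collapse the column vector to shape (n_samples,) so sklearn
# preprocessing does not warn about it.
y.ravel().shape
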
In [16]:
xgclf = xgb.XGBClassifier(n_estimators=1000, objective="multi:softprob", nthread=4)
prob, pred = run_cv(X,y,xgclf)
print("logloss: {:.4f}".format(log_loss(y, prob)))
print("accuracy: {:.4f}".format(accuracy_score(y, pred)))
cm = confusion_matrix(y, pred)
print(cm)


[    0     3     4 ..., 10864 10865 10867] [    1     2     8 ..., 10849 10859 10866]
[    0     1     2 ..., 10864 10866 10867] [    9    16    39 ..., 10847 10854 10865]
[    0     1     2 ..., 10865 10866 10867] [   21    23    67 ..., 10817 10827 10828]
[    0     1     2 ..., 10865 10866 10867] [    6    20    30 ..., 10842 10843 10848]
[    0     1     2 ..., 10865 10866 10867] [   51    56    87 ..., 10836 10857 10864]
[    0     1     2 ..., 10865 10866 10867] [   17    22    27 ..., 10851 10858 10861]
[    1     2     5 ..., 10865 10866 10867] [    0     3     4 ..., 10855 10856 10862]
[    0     1     2 ..., 10865 10866 10867] [   11    15    19 ..., 10800 10805 10863]
[    0     1     2 ..., 10864 10865 10866] [    5     7    12 ..., 10850 10860 10867]
[    0     1     2 ..., 10865 10866 10867] [   18    33    36 ..., 10803 10838 10852]
logloss: 0.0090
accuracy: 0.9980
[[1540    0    0    0    0    1    0    0    0]
 [   2 2475    0    0    0    0    0    0    1]
 [   0    0 2942    0    0    0    0    0    0]
 [   0    0    0  474    0    1    0    0    0]
 [   1    0    0    0   41    0    0    0    0]
 [   4    0    0    0    1  746    0    0    0]
 [   0    0    0    0    0    0  398    0    0]
 [   0    0    0    0    0    0    0 1224    4]
 [   0    0    0    0    0    0    0    7 1006]]

In [9]:
data.shape


Out[9]:
(10868, 2184)

2. Test Code Only


In [10]:
name_map = {}
column_names = data.columns
for cname in column_names:
    if cname not in name_map:
        name_map[cname] = 1
    else:
        name_map[cname] += 1
    if name_map[cname] > 1:
        print("Feature Name: {:s} -> {:d}".format(cname, name_map[cname]))


Feature Name: eax_x.1 -> 2
Feature Name: eax_y.1 -> 2
Feature Name: ebx_x.1 -> 2
Feature Name: ebx_y.1 -> 2
Feature Name: ecx_x.1 -> 2
Feature Name: ecx_y.1 -> 2
Feature Name: edi_x.1 -> 2
Feature Name: edi_y.1 -> 2
Feature Name: edx_x.1 -> 2
Feature Name: edx_y.1 -> 2

In [11]:
call_graph_features_train = pd.read_csv('data/final-call-graph-features-10percent.csv')

In [12]:
name_map = {}
column_names = call_graph_features_train.columns
for cname in column_names:
    if cname not in name_map:
        name_map[cname] = 1
    else:
        name_map[cname] += 1
    if name_map[cname] > 1:
        print("Feature Name: {:s} -> {:d}".format(cname, name_map[cname]))

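pandas can run the same duplicate-name check directly on the column Index,
which is more concise than the manual dictionary loop:

In [ ]:
# Same check as the loops above, using pandas' duplicate detection
# on the column Index.
print(data.columns[data.columns.duplicated()].tolist())
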
In [ ]:
help(data.merge)