In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score
# Notes on installing xgboost for Python.
#
# If you install from source and then have problems loading other
# packages, the cause is the easy-install.pth file that the install
# script dumps into the Python dist-packages directory. Delete the
# easy-install.pth file, then rename the xgboost egg and egg-info
# entries in dist-packages from:
# /usr/local/lib/python2.7/dist-packages/xgboost-0.4-py2.7.egg/EGG_INFO
# to
# /usr/local/lib/python2.7/dist-packages/xgboost-0.4-py2.7.dist-info
# and
# /usr/local/lib/python2.7/dist-packages/xgboost-0.4-py2.7.egg/xgboost
# to
# /usr/local/lib/python2.7/dist-packages/xgboost
# After that Python will be able to find all the packages.
#
# If installing from pip: don't even try, it is a nightmare.
#
# Anaconda install:
# XGBoost is not part of the official distribution, but several
# community members have created conda packages for it. The most
# up-to-date package appears to be the one by user creditx. The
# following command installs it:
#
# conda install -c creditx xgboost
#

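A quick sanity check that whichever install route succeeded is visible to
this interpreter (assuming a build recent enough to expose __version__):

In [ ]:
# Sanity check: confirm the interpreter can see the xgboost install.
print(xgb.__version__)
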
In [2]:
data = pd.read_csv('data/final-combined-train-data-30percent.csv')
labels = pd.read_csv('data/sorted-train-labels.csv')
datapoly = pd.read_csv('data/final-combined-train-data-30percent-poly.csv')

In [4]:
data.head()


Out[4]:
filename edx esi es ds ss cs ah al ax ... ASM_964 ASM_972 ASM_977 ASM_990 trainmean trainstd trainmin trainmax traintotal trainlogtotal
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 ... 32 49 53 10 586.160040 12877.609022 0.0 288961.0 2.181176e+12 28.410885
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 ... 48 9 9 116 5.908549 60.063976 0.0 1068.0 3.790235e+05 12.845354
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 ... 48 9 9 116 7.002982 64.756651 0.0 1173.0 5.319434e+05 13.184292
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 ... 48 9 9 116 327.150099 3278.958529 0.0 81305.0 8.721682e+10 25.191663
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 ... 48 89 32 71 5.932406 60.189034 0.0 1068.0 3.813462e+05 12.851463

5 rows × 623 columns


In [5]:
labels.head()


Out[5]:
Id Class
0 01IsoiSMh5gxyDYTl4CB 2
1 01SuzwMJEIXsK7A8dQbl 8
2 01azqd4InC7m9JpocGv5 9
3 01jsnpXSAlgw6aPeDxrU 9
4 01kcPWA9K2BOxQeS5Rju 1

In [4]:
datapoly.head()


Out[4]:
filename edx esi es ds ss cs ah al ax ... train_byte_p1 train_byte_p2 train_byte_p3 train_byte_p4 train_byte_p5 train_byte_p6 train_byte_p7 train_byte_p8 train_byte_p9 train_byte_p10
0 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 ... 1.0 0.614952 6874624.0 0.378166 4.227563e+06 4.726046e+13 0.232554 2.599748e+06 2.906291e+13 3.248979e+20
1 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 ... 1.0 0.843262 460288.0 0.711091 3.881435e+05 2.118650e+11 0.599636 3.273068e+05 1.786578e+11 9.751894e+16
2 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 ... 1.0 0.703961 5256192.0 0.495561 3.700153e+06 2.762755e+13 0.348855 2.604762e+06 1.944871e+13 1.452157e+20
3 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 ... 1.0 0.806035 4825600.0 0.649692 3.889601e+06 2.328642e+13 0.523674 3.135154e+06 1.876966e+13 1.123709e+20
4 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 ... 1.0 0.871610 712704.0 0.759704 6.211998e+05 5.079470e+11 0.662165 5.414439e+05 4.427316e+11 3.620159e+17

5 rows × 653 columns


In [3]:
# TODO: change to xgboost DMatrix (sketched below)
X = data.iloc[:, 1:]
# Shift labels from 1-9 to 0-8; xgboost's multiclass objectives expect
# zero-based class labels.
y = np.array(labels.iloc[:, 1:] - 1)
Xpoly = datapoly.iloc[:, 1:]

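One way to tackle the TODO above — a minimal sketch of wrapping the frames
in xgboost's native DMatrix, which is only needed when dropping down to the
low-level xgb.train API (the sklearn-style XGBClassifier used below accepts
DataFrames directly):

In [ ]:
# Sketch only: wrap features and 0-based labels in a DMatrix for the
# low-level API. Older xgboost builds may need X.values instead of X.
dtrain = xgb.DMatrix(X, label=y.ravel())
dtrain_poly = xgb.DMatrix(Xpoly, label=y.ravel())
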
In [5]:
def run_cv(X, y, clf):

    # Construct a KFold object (model_selection API; the old
    # cross_validation module was removed from sklearn).
    kf = KFold(n_splits=10, shuffle=True)
    y_prob = np.zeros((len(y), 9))
    y_pred = np.zeros(len(y))

    # Iterate through folds.
    for train_index, test_index in kf.split(X):
        print(train_index, test_index)
        X_train = X.iloc[train_index, :]
        X_test = X.iloc[test_index, :]
        y_train = y[train_index]

        # flatten() turns the (n_samples, 1) label column vector into the
        # 1-d array sklearn expects, avoiding the DataConversionWarning.
        clf.fit(X_train, y_train.flatten())
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)

    return y_prob, y_pred

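cross_val_score (imported above but otherwise unused) is a one-line
alternative when only per-fold scores are needed; run_cv is used below
because it also returns the per-sample probabilities and predictions.
A sketch, assuming the model_selection scorer names:

In [ ]:
# Per-fold scores only; no per-sample probabilities or predictions.
clf = xgb.XGBClassifier(n_estimators=100, nthread=4)
scores = cross_val_score(clf, X, y.ravel(), cv=10, scoring='neg_log_loss')
print("mean CV log loss: {:.4f}".format(-scores.mean()))
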
In [6]:
xgclf = xgb.XGBClassifier(n_estimators=1000, objective="multi:softmax", nthread=4)
prob, pred = run_cv(Xpoly,y,xgclf)
print("logloss: {:.4f}".format(log_loss(y, prob)))
print("accuracy: {:.4f}".format(accuracy_score(y, pred)))
cm = confusion_matrix(y, pred)
print(cm)


[    0     1     2 ..., 10865 10866 10867] [    8    16    21 ..., 10844 10849 10864]
[    0     1     2 ..., 10864 10865 10866] [   23    25    38 ..., 10847 10857 10867]
[    0     1     2 ..., 10865 10866 10867] [    7     9    10 ..., 10819 10861 10863]
[    0     1     2 ..., 10865 10866 10867] [   29    36    40 ..., 10823 10824 10832]
[    0     1     2 ..., 10865 10866 10867] [    6    14    18 ..., 10808 10836 10860]
[    0     1     2 ..., 10865 10866 10867] [   11    26    32 ..., 10840 10841 10848]
[    0     1     2 ..., 10864 10866 10867] [    4    12    17 ..., 10854 10858 10865]
[    0     2     4 ..., 10865 10866 10867] [    1     3     5 ..., 10853 10855 10859]
[    1     2     3 ..., 10865 10866 10867] [    0    31    54 ..., 10850 10851 10856]
[    0     1     3 ..., 10864 10865 10867] [    2    15    30 ..., 10843 10862 10866]
logloss: 0.0081
accuracy: 0.9982
[[1541    0    0    0    0    0    0    0    0]
 [   2 2475    0    0    0    0    0    0    1]
 [   0    0 2941    0    0    0    1    0    0]
 [   0    0    0  474    0    1    0    0    0]
 [   1    0    0    0   41    0    0    0    0]
 [   4    0    0    0    1  746    0    0    0]
 [   0    0    0    0    0    0  398    0    0]
 [   0    0    0    0    0    0    0 1226    2]
 [   0    0    0    0    0    0    0    7 1006]]

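matplotlib is imported at the top but never used; a heatmap makes the
off-diagonal misclassifications easier to spot. A sketch with a
hypothetical plot_confusion_matrix helper:

In [ ]:
def plot_confusion_matrix(cm, n_classes=9):
    # Render the confusion matrix as a heatmap; rows are true classes,
    # columns are predicted classes (0-8, i.e. original labels 1-9).
    plt.figure(figsize=(6, 6))
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion Matrix')
    plt.colorbar()
    ticks = np.arange(n_classes)
    plt.xticks(ticks, ticks)
    plt.yticks(ticks, ticks)
    plt.xlabel('Predicted class')
    plt.ylabel('True class')
    plt.show()

plot_confusion_matrix(cm)
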
In [30]:
xgclf = xgb.XGBClassifier(n_estimators=1000, objective="multi:softmax", nthread=4)
prob, pred = run_cv(X,y,xgclf)
print("logloss: {:.4f}".format(log_loss(y, prob)))
print("accuracy: {:.4f}".format(accuracy_score(y, pred)))
cm = confusion_matrix(y, pred)
print(cm)


/home/derek/anaconda3/lib/python3.5/site-packages/sklearn/preprocessing/label.py:108: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
(the same warning is repeated once per fold, 10 times in total)
accuracy: 0.9981
[[1540    0    0    0    0    1    0    0    0]
 [   2 2475    0    1    0    0    0    0    0]
 [   0    0 2941    0    0    0    1    0    0]
 [   0    0    0  474    0    1    0    0    0]
 [   1    0    0    0   41    0    0    0    0]
 [   4    0    0    0    1  746    0    0    0]
 [   0    0    0    0    0    0  398    0    0]
 [   0    0    0    0    0    0    0 1227    1]
 [   0    0    0    0    0    0    0    8 1005]]

In [35]:
print("logloss: {:.4f}".format(log_loss(y, prob)))


logloss: 0.0080

In [49]:
labelarr = y.ravel().tolist()

In [ ]:
labelarr

In [51]:
type(labelarr)


Out[51]:
list

In [14]:
data = pd.read_csv('data/all-combined-train-data-final.csv')
labels = pd.read_csv('data/sorted-train-labels.csv')

In [15]:
X = data.iloc[:, 1:]
ylabels = labels.iloc[:, 1:].values
y = ylabels - 1
y


Out[15]:
array([[1],
       [7],
       [8],
       ..., 
       [3],
       [3],
       [3]])

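This (n_samples, 1) column vector is exactly what triggers sklearn's
DataConversionWarning seen earlier; ravel() collapses it to the expected
1-d shape:

In [ ]:
# Collapse the column vector to shape (n_samples,) so sklearn
# preprocessing does not warn about it.
y.ravel().shape
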
In [16]:
xgclf = xgb.XGBClassifier(n_estimators=1000, objective="multi:softprob", nthread=4)
prob, pred = run_cv(X,y,xgclf)
print("logloss: {:.4f}".format(log_loss(y, prob)))
print("accuracy: {:.4f}".format(accuracy_score(y, pred)))
cm = confusion_matrix(y, pred)
print(cm)


[    0     3     4 ..., 10864 10865 10867] [    1     2     8 ..., 10849 10859 10866]
[    0     1     2 ..., 10864 10866 10867] [    9    16    39 ..., 10847 10854 10865]
[    0     1     2 ..., 10865 10866 10867] [   21    23    67 ..., 10817 10827 10828]
[    0     1     2 ..., 10865 10866 10867] [    6    20    30 ..., 10842 10843 10848]
[    0     1     2 ..., 10865 10866 10867] [   51    56    87 ..., 10836 10857 10864]
[    0     1     2 ..., 10865 10866 10867] [   17    22    27 ..., 10851 10858 10861]
[    1     2     5 ..., 10865 10866 10867] [    0     3     4 ..., 10855 10856 10862]
[    0     1     2 ..., 10865 10866 10867] [   11    15    19 ..., 10800 10805 10863]
[    0     1     2 ..., 10864 10865 10866] [    5     7    12 ..., 10850 10860 10867]
[    0     1     2 ..., 10865 10866 10867] [   18    33    36 ..., 10803 10838 10852]
logloss: 0.0090
accuracy: 0.9980
[[1540    0    0    0    0    1    0    0    0]
 [   2 2475    0    0    0    0    0    0    1]
 [   0    0 2942    0    0    0    0    0    0]
 [   0    0    0  474    0    1    0    0    0]
 [   1    0    0    0   41    0    0    0    0]
 [   4    0    0    0    1  746    0    0    0]
 [   0    0    0    0    0    0  398    0    0]
 [   0    0    0    0    0    0    0 1224    4]
 [   0    0    0    0    0    0    0    7 1006]]

In [9]:
data.shape


Out[9]:
(10868, 2184)

2. Test Code Only


In [10]:
name_map = {}
column_names = data.columns
for cname in column_names:
    if cname not in name_map:
        name_map[cname] = 1
    else:
        name_map[cname] += 1
    if name_map[cname] > 1:
        print("Feature Name: {:s} -> {:d}".format(cname, name_map[cname]))


Feature Name: eax_x.1 -> 2
Feature Name: eax_y.1 -> 2
Feature Name: ebx_x.1 -> 2
Feature Name: ebx_y.1 -> 2
Feature Name: ecx_x.1 -> 2
Feature Name: ecx_y.1 -> 2
Feature Name: edi_x.1 -> 2
Feature Name: edi_y.1 -> 2
Feature Name: edx_x.1 -> 2
Feature Name: edx_y.1 -> 2

In [11]:
call_graph_features_train = pd.read_csv('data/final-call-graph-features-10percent.csv')

In [12]:
name_map = {}
column_names = call_graph_features_train.columns
for cname in column_names:
    if cname not in name_map:
        name_map[cname] = 1
    else:
        name_map[cname] += 1
    if name_map[cname] > 1:
        print("Feature Name: {:s} -> {:d}".format(cname, name_map[cname]))

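pandas can run the same duplicate-name check directly on the column Index,
which is more concise than the manual dictionary loop:

In [ ]:
# Same check as the loops above, using pandas' duplicate detection
# on the column Index.
print(data.columns[data.columns.duplicated()].tolist())
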
In [ ]:
help(data.merge)