Java Class File Analysis

In this notebook we explore, understand, and classify Java class files as either 'benign' or 'malicious'. We explore the data, apply machine learning algorithms, add new features, and then apply more machine learning. Finally, we test our classifier on a large corpus of files to measure its effectiveness.

References

  • http://docs.oracle.com/javase/specs/jvms/se7/html/jvms-4.html

Python Modules Used:

  • Pandas: Python Data Analysis Library (http://pandas.pydata.org)
  • Scikit Learn (http://scikit-learn.org) Scikit-learn: Machine Learning in Python, Pedregosa et al., JMLR 12, pp. 2825-2830, 2011.
  • Matplotlib: Python 2D plotting library (http://matplotlib.org)
  • IPython Notebook for this talk: http://clicksecurity.github.io/data_hacking


    Imports and plot defaults

    
    
    In [117]:
    import pandas as pd
    print 'pandas version is', pd.__version__
    import numpy as np
    print 'numpy version is', np.__version__
    import sklearn
    print 'scikit-learn version is', sklearn.__version__
    import matplotlib
    print 'matplotlib version is', matplotlib.__version__
    import matplotlib.pyplot as plt
    
    
    
    
    pandas version is 0.13.0
    numpy version is 1.7.1
    scikit-learn version is 0.14.1
    matplotlib version is 1.4.1
    

    Plotting defaults and helper functions

    
    
    In [118]:
    %matplotlib inline
    plt.rcParams['font.size'] = 18.0
    plt.rcParams['figure.figsize'] = 16.0, 5.0
    
    
    
    In [119]:
    def plot_cm(cm, labels):
        # Compute percentages
        percent = (cm*100.0)/np.array(np.matrix(cm.sum(axis=1)).T)
        print 'Confusion Matrix Stats'
        for i, label_i in enumerate(labels):
            for j, label_j in enumerate(labels):
                print "%s/%s: %.2f%% (%d/%d)" % (label_i, label_j, (percent[i][j]), cm[i][j], cm[i].sum())
    
        # Show confusion matrix
        # Thanks to kermit666 from stackoverflow
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.grid(b=False)
        cax = ax.matshow(percent, cmap='coolwarm',vmin=0,vmax=100)
        plt.title('')
        fig.colorbar(cax)
        ax.set_xticklabels([''] + labels)
        ax.set_yticklabels([''] + labels)
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.show()
    
    
    
    In [120]:
    def extract_character_info(string):
        # Return lists of the lengths of consecutive runs of lowercase letters,
        # uppercase letters, and digits found in the given string.
        lowercase_runs = []
        uppercase_runs = []
        digit_runs = []
        lower = map(str.islower, str(string))
        upper = map(str.isupper, str(string))
        digits = map(str.isdigit, str(string))
        
        current_length = 0
        current = False
        for l in lower:
            if l:
                current_length += 1
                current = True
            else:
                if current:
                    lowercase_runs.append(current_length)
                    current_length = 0
                current = False
        if current:
            lowercase_runs.append(current_length)
    
        current_length = 0
        current = False
        for u in upper:
            if u:
                current_length += 1
                current = True
            else:
                if current:
                    uppercase_runs.append(current_length)
                    current_length = 0
                current = False
        if current:
            uppercase_runs.append(current_length)
    
        current_length = 0
        current = False
        for d in digits:
            if d:
                current_length += 1
                current = True
            else:
                if current:
                    digit_runs.append(current_length)
                    current_length = 0
                current = False
        if current:
            digit_runs.append(current_length)
        
        return lowercase_runs, uppercase_runs, digit_runs
    

    Function to extract the features we are interested in from the JSON blob

    
    
    In [121]:
    def extract_features(data):
        features = {}
        try:
            features['sha256'] = data['metadata']['sha256']
            features['size'] = data['metadata']['file_size']
            features['entropy'] = data['metadata']['entropy']
            if 'sourcefile' in data['characteristics']['java']:
                features['source file'] = data['characteristics']['java']['sourcefile']
            else:
                features['source file'] = 'No Source File'
            if 'access_permissions' in data['characteristics']['java']:
                features['ap_count'] = len(data['characteristics']['java']['access_permissions'])
                for ap in data['characteristics']['java']['access_permissions']:
                    features[str.lower(str(ap).replace(" ", "_"))] = 1
            features['class name'] = data['characteristics']['java']['class_name']
            features['class_name_slash_count'] = features['class name'].count('/')
            features['class_name_length'] = len(features['class name'])
            cn_lowercase_runs, cn_uppercase_runs, cn_digit_runs = extract_character_info(features['class name'])
            cn_lowercase_run_longest = 0
            cn_lowercase_run_average = 0
            cn_uppercase_run_longest = 0
            cn_uppercase_run_average = 0
            cn_digit_run_longest = 0
            cn_digit_run_average = 0
            if cn_lowercase_runs:
                cn_lowercase_run_longest = np.max(cn_lowercase_runs)
                cn_lowercase_run_average = np.mean(cn_lowercase_runs)
            features['class_name_lowercase_run_longest'] = cn_lowercase_run_longest
            features['class_name_lowercase_run_avg'] = cn_lowercase_run_average
                        
            if cn_uppercase_runs:
                cn_uppercase_run_longest = np.max(cn_uppercase_runs)
                cn_uppercase_run_average = np.mean(cn_uppercase_runs)
            features['class_name_uppercase_run_longest'] = cn_uppercase_run_longest
            features['class_name_uppercase_run_avg'] = cn_uppercase_run_average
    
            if cn_digit_runs:
                cn_digit_run_longest = np.max(cn_digit_runs)
                cn_digit_run_average = np.mean(cn_digit_runs)
            features['class_name_digit_run_longest'] = cn_digit_run_longest
            features['class_name_digit_run_avg'] = cn_digit_run_average
            
            features['major version'] = data['characteristics']['java']['major_version']
            features['minor version'] = data['characteristics']['java']['minor_version']
            if 'method_names' in data['characteristics']['java']:
                features['method names'] = data['characteristics']['java']['method_names']
            else:
                features['method names'] = []
            features['methods_count'] = len(features['method names'])
            lowercase_run_longest = 0
            lowercase_run_average = 0 
            lowercase_runs = []
            uppercase_run_longest = 0
            uppercase_run_average = 0 
            uppercase_runs = []
            digit_run_longest = 0
            digit_run_average = 0
            digit_runs = []
    
            for method in features['method names']:
                lc, uc, d = extract_character_info(method)
                lowercase_runs.extend(lc)
                uppercase_runs.extend(uc)
                digit_runs.extend(d)
                
            if lowercase_runs:
                lowercase_run_longest = np.max(lowercase_runs)
                lowercase_run_average = np.mean(lowercase_runs)
            features['method_name_lowercase_run_longest'] = lowercase_run_longest
            features['method_name_lowercase_run_avg'] = lowercase_run_average
                        
            if uppercase_runs:
                uppercase_run_longest = np.max(uppercase_runs)
                uppercase_run_average = np.mean(uppercase_runs)
            features['method_name_uppercase_run_longest'] = uppercase_run_longest
            features['method_name_uppercase_run_avg'] = uppercase_run_average
    
            if digit_runs:
                digit_run_longest = np.max(digit_runs)
                digit_run_average = np.mean(digit_runs)
            features['method_name_digit_run_longest'] = digit_run_longest
            features['method_name_digit_run_avg'] = digit_run_average
            
            if 'interfaces' in data['characteristics']['java']:
                features['interfaces'] = data['characteristics']['java']['interfaces']
            else:
                features['interfaces'] = []
            features['interface_count'] = len(features['interfaces'])
            features['constant_pool_count'] = data['characteristics']['java']['const_pool_count']
    
        except KeyError as ke:
            print 'ERROR:', ke, data['metadata']['sha256']
        return features
    
    
    
    In [122]:
    def load_files(file_list):
        import json
        features_list = []
        for filename in file_list:
            with open(filename,'rb') as f:
                features = extract_features(json.loads(f.read()))
                features_list.append(features)
        return features_list
    

    We read in the benign and malicious files

    
    
    In [123]:
    # Good files
    import glob
    
    good_list = glob.glob('data/clean/*.results')
    good_features = load_files(good_list)
    print "Files:", len(good_list)
    
    
    
    
    Files: 500
    
    
    
    In [124]:
    # Bad files
    bad_list = glob.glob('data/malicious/*.results')
    bad_features = load_files(bad_list)
    print "Files:", len(bad_list)
    
    
    
    
    Files: 520
    
    
    
    In [125]:
    df_good = pd.DataFrame.from_records(good_features)
    df_good.fillna(0, inplace=True)
    df_good['label'] = 'benign'
    df_good.head()
    
    
    
    
    Out[125]:
    acc_abstract acc_annotation acc_enum acc_final acc_interface acc_public acc_super acc_synthetic ap_count class name class_name_digit_run_avg class_name_digit_run_longest class_name_length class_name_lowercase_run_avg class_name_lowercase_run_longest class_name_slash_count class_name_uppercase_run_avg class_name_uppercase_run_longest constant_pool_count entropy
    0 1 0 0 0 0 1 1 0 3 com/google/common/collect/ForwardingConcurrentMap 0 0 49 6.000000 9 4 1.0 1 54 4.990507 ...
    1 0 0 0 0 0 0 1 0 1 org/apache/hadoop/io/compress/GzipCodec$GzipOu... 0 0 82 4.846154 8 5 1.5 5 39 5.205063 ...
    2 0 0 0 1 0 0 1 0 2 com/google/common/collect/Multisets$Unmodifiab... 0 0 62 6.625000 11 4 1.0 1 131 4.996721 ...
    3 0 0 0 0 0 0 1 0 1 hu/openig/mechanics/StaticDefensePlanner$1 1 1 42 5.666667 9 3 1.0 1 56 5.282413 ...
    4 0 0 0 0 0 1 1 0 2 org/apache/commons/io/LineIterator 0 0 34 4.666667 7 4 1.0 1 95 5.285082 ...

    5 rows × 36 columns

    
    
    In [126]:
    df_bad = pd.DataFrame.from_records(bad_features)
    df_bad.fillna(0, inplace=True)
    df_bad['label'] = 'malicious'
    df_bad.head()
    
    
    
    
    Out[126]:
    acc_final acc_public acc_super ap_count class name class_name_digit_run_avg class_name_digit_run_longest class_name_length class_name_lowercase_run_avg class_name_lowercase_run_longest class_name_slash_count class_name_uppercase_run_avg class_name_uppercase_run_longest constant_pool_count entropy interface_count interfaces major version method names method_name_digit_run_avg
    0 0 1 1 2 Main 0 0 4 3.0 3 0 1.000000 1 86 6.114522 0 [] 48 [<init>, init] 0 ...
    1 0 0 1 1 YdCdHX/VcZaXVjyy 0 0 16 1.4 3 1 1.333333 2 52 5.539514 0 [] 49 [<init>, ktCgxlqo, <clinit>] 0 ...
    2 0 1 1 2 aOcMSp 0 0 6 1.0 1 0 1.500000 2 159 5.953528 0 [] 49 [<init>, gvuNr, <clinit>] 0 ...
    3 0 0 1 1 a/zylasqwjlpbqyrwrr 0 0 19 9.0 17 1 0.000000 0 478 6.348531 0 [] 49 [<init>, eiaxyercdfvbgscpbv, yginlmcynkyuohnfh... 0 ...
    4 0 1 1 2 tljpjunbjwtqlywm/sdnrybknlf 0 0 27 13.0 16 1 0.000000 0 122 5.376762 0 [] 49 [<init>, dvvwse, <clinit>] 0 ...

    5 rows × 31 columns

    
    
    In [127]:
    df = pd.concat([df_bad, df_good], ignore_index=True)
    df.fillna(0, inplace=True)
    

    Let's explore the data.

    We start by comparing the file sizes.

    
    
    In [129]:
    df.boxplot(column='size', by='label')
    plt.ylabel('File Size')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[129]:
    <matplotlib.text.Text at 0x12b873d90>

    Let's zoom in. We can see some separation between the files here.

    
    
    In [130]:
    df.boxplot(column='size', by='label')
    plt.ylabel('File Size')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    plt.ylim(0, 15000)
    
    
    
    
    Out[130]:
    (0, 15000)

    Next we compare the entropy.

    
    
    In [131]:
    df.boxplot('entropy', 'label')
    plt.ylabel('Entropy')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[131]:
    <matplotlib.text.Text at 0x12b8ab5d0>

    Next we compare the number of items in the constant pool

    
    
    In [132]:
    df.boxplot(column='constant_pool_count', by='label')
    plt.ylabel('Constant Pool Count')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[132]:
    <matplotlib.text.Text at 0x12b8b8650>

    It's hard to see the detail, so we zoom in

    
    
    In [133]:
    df.boxplot(column='constant_pool_count', by='label')
    plt.xlabel('')
    plt.ylabel('Constant Pool Count')
    plt.title('')
    plt.suptitle('')
    plt.ylim(0, 1000)
    
    
    
    
    Out[133]:
    (0, 1000)

    Here we see the number of methods the class file has.

    
    
    In [134]:
    df.boxplot(column='methods_count', by='label')
    plt.ylabel('Number of Methods')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[134]:
    <matplotlib.text.Text at 0x12b8b1210>

    Finally, we check out the number of interfaces the class file has.

    
    
    In [135]:
    df.boxplot(column='interface_count', by='label')
    plt.ylabel('Number of Interfaces')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[135]:
    <matplotlib.text.Text at 0x12b68ced0>

    First we try classifying binaries just based on some simple features we already have

    
    
    In [136]:
    my_seed = 1022
    my_tsize = .2
    
    
    
    In [137]:
    import sklearn.ensemble
    clf_simple = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
    simple_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
                        'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
                        'entropy', 'size', 'interface_count', 'major version', 'methods_count', 'minor version']
    
    X = df.as_matrix(simple_features)
    y = np.array(df['label'].tolist())
    
    scores = sklearn.cross_validation.cross_val_score(clf_simple, X, y, cv=10)
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    
    
    
    
    Accuracy: 0.927 (+/- 0.044)
    

    Not terribly great, but we're just getting started.

    
    
    In [138]:
    import sklearn.ensemble
    from sklearn.metrics import confusion_matrix
    from sklearn.cross_validation import train_test_split
    
    # 80/20 Split for predictive test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf_simple.fit(X_train, y_train)
    y_pred = clf_simple.predict(X_test)
    labels = ['benign', 'malicious']
    cm = confusion_matrix(y_test, y_pred, labels)
    plot_cm(cm, labels)
    
    
    
    
    Confusion Matrix Stats
    benign/benign: 93.07% (94/101)
    benign/malicious: 6.93% (7/101)
    malicious/benign: 10.68% (11/103)
    malicious/malicious: 89.32% (92/103)
    

    Here we have the classifier tell us which features were most important and how important they were. Entropy and constant pool count are the two most important features by far.

    
    
    In [139]:
    importances = zip(simple_features, clf_simple.feature_importances_)
    importances.sort(key=lambda k:k[1], reverse=True)
    for idx, im in enumerate(importances):
        print (str(idx+1) + ':').ljust(4), im[0].ljust(20), round(im[1], 5)
    
    
    
    
    1:   entropy              0.29813
    2:   constant_pool_count  0.24814
    3:   size                 0.14489
    4:   methods_count        0.08492
    5:   interface_count      0.08152
    6:   major version        0.05055
    7:   ap_count             0.02908
    8:   acc_public           0.02446
    9:   acc_final            0.01284
    10:  acc_abstract         0.00982
    11:  minor version        0.00623
    12:  acc_super            0.00541
    13:  acc_interface        0.00275
    14:  acc_enum             0.00125
    15:  acc_synthetic        2e-05
    16:  acc_annotation       -0.0
    

    What about the method names?

    Next we will explore the names of the methods

    
    
    In [145]:
    bad = []
    good = []
    for strings, label in zip(df['method names'], df['label']):
        for name in strings:
            d = {'method name': name}
            if label == 'malicious' and d not in bad:
                bad.append(d)
            elif label == 'benign' and d not in good:
                good.append(d)
    
    df_method_names_bad = pd.DataFrame.from_records(bad)
    df_method_names_good = pd.DataFrame.from_records(good)
    
    
    
    In [146]:
    df_method_names_bad.head(50)
    
    
    
    
    Out[146]:
    method name
    0 <init>
    1 init
    2 ktCgxlqo
    3 <clinit>
    4 gvuNr
    5 eiaxyercdfvbgscpbv
    6 yginlmcynkyuohnfhe
    7 mtyvzetsjhvnbyz
    8 fxxhgjttqfavlooxcb
    9 wyjgamzmowywjihkuuf
    10 kgthsnqdqutacivcptong
    11 qgasjqrogibkblyzourtq
    12 glfouhczfxzyskaystx
    13 mikczoanebdkwpyb
    14 bwssduenvebnvgix
    15 wafrcwijizypmitodmb
    16 bfznyeevclzzxxqbw
    17 jmzisxwtxhekbkl
    18 szivddjiptybevduli
    19 forwnxmgnutbtdwvptj
    20 mwwmrvljafpkwzdiy
    21 vvpbdzrhvvnzaieyi
    22 qkkxoygluwwlnwbxu
    23 dvvwse
    24 c
    25 k
    26 main
    27 writeEmbeddedFile
    28 bootstrap
    29 getJreExecutable
    30 addExtension
    31 findInDir
    32 normalize
    33 dissect
    34 class$
    35 tgznSIAR
    36 kWfVWtw
    37 BodFzDax
    38 xXVBwx
    39 VdJiGyZfj
    40 taddhnwrkj
    41 C
    42 ALLATORI_DEMO
    43 jvsamhqyvgekftsj
    44 knjkb
    45 B
    46 cmjnkr
    47 jmdpes
    48 tqffjybms
    49 vtvtmh

    50 rows × 1 columns

    
    
    In [147]:
    df_method_names_good.head(50)
    
    
    
    
    Out[147]:
    method name
    0 <init>
    1 delegate
    2 putIfAbsent
    3 remove
    4 replace
    5 resetState
    6 comparator
    7 createElementSet
    8 elementSet
    9 descendingMultiset
    10 firstEntry
    11 lastEntry
    12 pollFirstEntry
    13 pollLastEntry
    14 headMultiset
    15 subMultiset
    16 tailMultiset
    17 invoke
    18 hasNext
    19 isValidLine
    20 next
    21 nextLine
    22 close
    23 closeQuietly
    24 exec
    25 getInitial
    26 getIntermed
    27 getFinal
    28 max
    29 outputSchema
    30 estimateLength
    31 appendTo
    32 getXPath
    33 run
    34 secToHMS
    35 contribute
    36 onBeforeRender
    37 setCloseEvent
    38 setSelectEvent
    39 setChangeEvent
    40 setSource
    41 statement
    42 setDocumentLocator
    43 startDocument
    44 endDocument
    45 startPrefixMapping
    46 endPrefixMapping
    47 startElement
    48 endElement
    49 characters

    50 rows × 1 columns

    When we look at the method names, we see that the benign ones look "normal", while the malicious ones are a mix: some look "normal" and some look like gibberish. To try to capture that, we extract the longest run of consecutive lowercase letters, uppercase letters, and digits from each method name. We also calculate the average length of these runs.
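
    As a quick illustration (a standalone sketch using the extract_character_info function defined above; the two method names are taken from the tables shown earlier), here is what the run extraction produces for a "normal" name versus a gibberish-looking one:

    # Illustrative only: compare run lengths for a benign-looking and a
    # gibberish-looking method name from the tables above.
    for name in ['putIfAbsent', 'eiaxyercdfvbgscpbv']:
        lc, uc, d = extract_character_info(name)
        print name, '-> lowercase runs:', lc, 'uppercase runs:', uc, 'digit runs:', d
    # putIfAbsent -> lowercase runs: [3, 1, 5] uppercase runs: [1, 1] digit runs: []
    # eiaxyercdfvbgscpbv -> lowercase runs: [18] uppercase runs: [] digit runs: []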

    
    
    In [148]:
    df.boxplot('method_name_lowercase_run_longest', 'label')
    plt.ylabel('Max length of lower case letters')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[148]:
    <matplotlib.text.Text at 0x112560510>
    
    
    In [149]:
    df.boxplot('method_name_lowercase_run_avg', 'label')
    plt.ylabel('Avg length of lower case letters')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[149]:
    <matplotlib.text.Text at 0x113767090>
    
    
    In [150]:
    df.boxplot('method_name_uppercase_run_longest', 'label')
    plt.ylabel('Max length of upper case letters')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[150]:
    <matplotlib.text.Text at 0x1136dd650>
    
    
    In [151]:
    df.boxplot('method_name_uppercase_run_avg', 'label')
    plt.ylabel('Avg length of upper case letters')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[151]:
    <matplotlib.text.Text at 0x10ea06190>
    
    
    In [152]:
    df.boxplot('method_name_digit_run_longest', 'label')
    plt.ylabel('Max length of digits')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[152]:
    <matplotlib.text.Text at 0x10ea53810>
    
    
    In [153]:
    df.boxplot('method_name_digit_run_avg', 'label')
    plt.ylabel('Avg length of digits')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[153]:
    <matplotlib.text.Text at 0x10eabb2d0>

    We train the classifier using these new features.

    
    
    In [154]:
    import sklearn.ensemble
    clf_methods = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
    method_name_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
                        'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
                        'entropy', 'size', 'interface_count', 'major version', 'methods_count',
                        'minor version',
                        'method_name_digit_run_avg', 'method_name_digit_run_longest',
                        'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
                        'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest']
    
    X = df.as_matrix(method_name_features)
    y = np.array(df['label'].tolist())
    
    scores = sklearn.cross_validation.cross_val_score(clf_methods, X, y, cv=10)
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    
    
    
    
    Accuracy: 0.950 (+/- 0.037)
    

    We do see improvement in the classifier, so that is encouraging. But definitely still work to be done.

    
    
    In [155]:
    import sklearn.ensemble
    from sklearn.metrics import confusion_matrix
    from sklearn.cross_validation import train_test_split
    
    # 80/20 Split for predictive test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf_methods.fit(X_train, y_train)
    y_pred = clf_methods.predict(X_test)
    labels = ['benign', 'malicious']
    cm = confusion_matrix(y_test, y_pred, labels)
    plot_cm(cm, labels)
    
    
    
    
    Confusion Matrix Stats
    benign/benign: 94.90% (93/98)
    benign/malicious: 5.10% (5/98)
    malicious/benign: 10.38% (11/106)
    malicious/malicious: 89.62% (95/106)
    

    We check the feature importances from the classifier again. The method name features are not the most important features, but they rank relatively high. This explains why the classifier improved only slightly.

    
    
    In [156]:
    importances = zip(method_name_features, clf_methods.feature_importances_)
    importances.sort(key=lambda k:k[1], reverse=True)
    for idx, im in enumerate(importances[0:15]):
        print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5)
    

    What about the class names?

    
    
    In [157]:
    for idx, gcn in enumerate(df_good['class name']):
        print gcn
        if idx == 19:
            break
    
    
    
    
    com/google/common/collect/ForwardingConcurrentMap
    org/apache/hadoop/io/compress/GzipCodec$GzipOutputStream$ResetableGZIPOutputStream
    com/google/common/collect/Multisets$UnmodifiableSortedMultiset
    hu/openig/mechanics/StaticDefensePlanner$1
    org/apache/commons/io/LineIterator
    org/apache/pig/builtin/IntMax
    org/apache/commons/lang/time/FastDateFormat$StringLiteral
    hu/openig/screen/items/ResearchProductionScreen$15
    org/dom4j/XPathException
    threadWordlistExec
    org/odlabs/wiquery/ui/autocomplete/AbstractAutocompleteComponent$InnerAutocomplete
    org/xml/sax/ContentHandler
    org/apache/commons/httpclient/protocol/SSLProtocolSocketFactory
    org/apache/pig/PigException
    org/junit/runners/Enclosed
    org/jets3t/service/io/ProgressMonitoredInputStream
    org/apache/pig/impl/util/CastUtils
    com/google/common/base/Joiner$2
    com/google/common/io/CharStreams
    org/apache/commons/compress/archivers/cpio/CpioArchiveOutputStream
    
    
    
    In [158]:
    for idx, gcn in enumerate(df_bad['class name']):
        print gcn
        if idx == 19:
            break
    
    
    
    
    Main
    YdCdHX/VcZaXVjyy
    aOcMSp
    a/zylasqwjlpbqyrwrr
    tljpjunbjwtqlywm/sdnrybknlf
    Mainer
    Main
    mNIJnGIOkm/Payload
    aHrMCrboe/chspSxY
    Main
    OSrAfQWThe/SHLeanN
    hhIji/XQDODV
    a/dwrwbjyhllzu
    H
    enudwwlhl/wsshvntsenuwajehdujlchpms
    Main
    iACVKaBQCV/HhtBSGn
    GondadGondadExp
    Main
    enudwwlhl/yhfwcgjacjjauyvut
    

    Much like the method names, the malicious class names look like gibberish. They also tend to be shorter and contain fewer slashes.
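
    To make that concrete, here is a small sketch (not part of the original analysis; it simply reuses one benign and one malicious class name already listed above) computing the same length and slash-count features used in extract_features:

    # Illustrative only: class name length and slash count for one benign and
    # one malicious class name from the listings above.
    for cn in ['com/google/common/collect/ForwardingConcurrentMap', 'YdCdHX/VcZaXVjyy']:
        print cn, '-> length:', len(cn), 'slashes:', cn.count('/')
    # com/google/common/collect/ForwardingConcurrentMap -> length: 49 slashes: 4
    # YdCdHX/VcZaXVjyy -> length: 16 slashes: 1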

    
    
    In [159]:
    df.boxplot('class_name_length', 'label')
    plt.ylabel('Class Name Length')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[159]:
    <matplotlib.text.Text at 0x12b7d9590>
    
    
    In [160]:
    df.boxplot('class_name_slash_count', 'label')
    plt.ylabel('Class Name Slash Count')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[160]:
    <matplotlib.text.Text at 0x12b64ee10>
    
    
    In [161]:
    df.boxplot('class_name_lowercase_run_longest', 'label')
    plt.ylabel('Max Run of Lower Case Letters')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[161]:
    <matplotlib.text.Text at 0x11278b750>
    
    
    In [162]:
    df.boxplot('class_name_lowercase_run_avg', 'label')
    plt.ylabel('Avg Run of Lower Case Letters')
    plt.xlabel('')
    plt.title('')
    plt.suptitle('')
    
    
    
    
    Out[162]:
    <matplotlib.text.Text at 0x1127ce7d0>
    
    
    In [163]:
    import sklearn.ensemble
    from sklearn.metrics import confusion_matrix
    from sklearn.cross_validation import train_test_split
    
    clf_all = sklearn.ensemble.RandomForestClassifier(n_estimators=75)
    all_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
                        'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
                        'entropy', 'interface_count', 'major version', 'methods_count',
                        'size', 'minor version',
                        'method_name_digit_run_avg', 'method_name_digit_run_longest',
                        'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
                        'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest',
                        'class_name_digit_run_avg', 'class_name_digit_run_longest',
                        'class_name_length', 'class_name_lowercase_run_avg',
                        'class_name_lowercase_run_longest', 'class_name_slash_count',
                        'class_name_uppercase_run_avg', 'class_name_uppercase_run_longest']                
    
    X = df.as_matrix(all_features)
    y = np.array(df['label'].tolist())
    labels = ['good', 'bad']
    
    scores = sklearn.cross_validation.cross_val_score(clf_all, X, y, cv=10)
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    
    
    
    
    Accuracy: 0.993 (+/- 0.018)
    

    Finally, some significant improvement.

    
    
    In [172]:
    import sklearn.ensemble
    from sklearn.metrics import confusion_matrix
    from sklearn.cross_validation import train_test_split
    
    # 80/20 Split for predictive test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf_all.fit(X_train, y_train)
    y_pred = clf_all.predict(X_test)
    labels = ['benign', 'malicious']
    cm = confusion_matrix(y_test, y_pred, labels)
    plot_cm(cm, labels)
    
    
    
    
    Confusion Matrix Stats
    benign/benign: 97.06% (99/102)
    benign/malicious: 2.94% (3/102)
    malicious/benign: 0.00% (0/102)
    malicious/malicious: 100.00% (102/102)
    

    One of the features of the classifier is the ability to give a probability that a feature set belongs to a specific class; for a random forest this is roughly the fraction of trees voting for that class. In the code below, we ask the classifier for the probability that each file belongs to the malicious class and only mark the file as malicious if that probability exceeds 80%.

    
    
    In [173]:
    y_probs = clf_all.predict_proba(X_test)[:,1]
    thres = .80 # This can be set to whatever you'd like
    y_pred[y_probs>thres] = 'malicious'
    y_pred[y_probs<=thres] = 'benign'
    cm = confusion_matrix(y_test, y_pred, labels)
    plot_cm(cm, labels)
    
    
    
    
    Confusion Matrix Stats
    benign/benign: 99.02% (101/102)
    benign/malicious: 0.98% (1/102)
    malicious/benign: 2.94% (3/102)
    malicious/malicious: 97.06% (99/102)
    
    
    
    We do the same, but set the threshold lower, to only 20%.
    
    
    
    In [174]:
    y_probs = clf_all.predict_proba(X_test)[:,1]
    thres = .20 # This can be set to whatever you'd like
    y_pred[y_probs>thres] = 'malicious'
    y_pred[y_probs<=thres] = 'benign'
    cm = confusion_matrix(y_test, y_pred, labels)
    plot_cm(cm, labels)
    
    
    
    
    Confusion Matrix Stats
    benign/benign: 95.10% (97/102)
    benign/malicious: 4.90% (5/102)
    malicious/benign: 0.00% (0/102)
    malicious/malicious: 100.00% (102/102)
    

    You can see that we still get good results even when modifying the threshold level. When we plot the scores, we see that most of the scores are either near 0 or 1. This indicates that the classifier is confident about most of the labels.

    
    
    In [175]:
    scores = clf_all.predict_proba(X_test)[:,1]
    plt.hist(scores, bins=20)
    plt.grid(True)
    plt.show()
    
    
    
    

    We ask the classifier to report the feature importances again, and we see that the features extracted from the class name now totally dominate the classifier. On one hand, we got great results with this classifier; on the other, it is almost completely reliant on one aspect of the file.

    
    
    In [176]:
    importances = zip(all_features, clf_all.feature_importances_)
    importances.sort(key=lambda k:k[1], reverse=True)
    sum = 0
    for idx, im in enumerate(importances):
        sum += round(im[1], 5)
        print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), sum
    
    
    
    
    1:   class_name_slash_count              0.31448 0.31448
    2:   class_name_length                   0.28798 0.60246
    3:   class_name_lowercase_run_longest    0.08185 0.68431
    4:   entropy                             0.06346 0.74777
    5:   class_name_lowercase_run_avg        0.06043 0.8082
    6:   constant_pool_count                 0.03532 0.84352
    7:   size                                0.02862 0.87214
    8:   class_name_uppercase_run_longest    0.02839 0.90053
    9:   class_name_uppercase_run_avg        0.02537 0.9259
    10:  method_name_lowercase_run_avg       0.01137 0.93727
    11:  interface_count                     0.01073 0.948
    12:  class_name_digit_run_avg            0.00947 0.95747
    13:  method_name_lowercase_run_longest   0.00751 0.96498
    14:  acc_public                          0.00604 0.97102
    15:  methods_count                       0.00509 0.97611
    16:  method_name_uppercase_run_longest   0.00459 0.9807
    17:  ap_count                            0.0039 0.9846
    18:  class_name_digit_run_longest        0.00336 0.98796
    19:  major version                       0.00286 0.99082
    20:  method_name_uppercase_run_avg       0.00277 0.99359
    21:  acc_abstract                        0.00181 0.9954
    22:  method_name_digit_run_avg           0.00148 0.99688
    23:  acc_final                           0.00105 0.99793
    24:  method_name_digit_run_longest       0.00103 0.99896
    25:  minor version                       0.00054 0.9995
    26:  acc_interface                       0.0005 1.0
    27:  acc_super                           1e-05 1.00001
    28:  acc_annotation                      0.0 1.00001
    29:  acc_enum                            0.0 1.00001
    30:  acc_synthetic                       0.0 1.00001
    

    Using a different classifier

    Let's try a different classifier: Extra Trees Classifier (like RandomForest, but even more random).
    http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html

    
    
    In [179]:
    clf_er = sklearn.ensemble.ExtraTreesClassifier(n_estimators=50)
    X_er = df.as_matrix(all_features)
    y_er = np.array(df['label'].tolist())
    labels = ['benign', 'malicious']
    
    scores = sklearn.cross_validation.cross_val_score(clf_er, X_er, y_er, cv=10)
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    
    
    
    
    Accuracy: 0.997 (+/- 0.013)
    
    
    
    In [180]:
    # 80/20 Split for predictive test
    X_train, X_test, y_train, y_test = train_test_split(X_er, y_er, test_size=my_tsize, random_state=my_seed)
    clf_er.fit(X_train, y_train)
    y_pred = clf_er.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, labels)
    plot_cm(cm, labels)
    
    
    
    
    Confusion Matrix Stats
    benign/benign: 100.00% (90/90)
    benign/malicious: 0.00% (0/90)
    malicious/benign: 0.00% (0/114)
    malicious/malicious: 100.00% (114/114)
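
    We also try a support vector machine (SVC). Support vector machines are sensitive to feature scale, so we standardize the feature matrix with sklearn.preprocessing.scale before training.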
    
    
    
    In [181]:
    import sklearn.svm
    import sklearn.preprocessing
    clf_svc = sklearn.svm.SVC()
    X_svc = df.as_matrix(all_features)
    X_svc = sklearn.preprocessing.scale(X_svc)
    y_svc = np.array(df['label'].tolist())
    labels = ['benign', 'malicious']
    
    scores = sklearn.cross_validation.cross_val_score(clf_svc, X_svc, y_svc, cv=10)
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    
    
    
    
    Accuracy: 0.993 (+/- 0.015)
    
    
    
    In [182]:
    # 80/20 Split for predictive test
    X_train, X_test, y_train, y_test = train_test_split(X_svc, y_svc, test_size=my_tsize, random_state=my_seed)
    clf_svc.fit(X_train, y_train)
    y_pred = clf_svc.predict(X_test)
    cm = confusion_matrix(y_test, y_pred, labels)
    plot_cm(cm, labels)
    
    
    
    
    Confusion Matrix Stats
    benign/benign: 98.89% (89/90)
    benign/malicious: 1.11% (1/90)
    malicious/benign: 0.00% (0/114)
    malicious/malicious: 100.00% (114/114)
    
    
    
    In [183]:
    # Assess predictive performance of the RandomForest with 20-fold cross validation.
    scores = sklearn.cross_validation.cross_val_score(clf_all, X, y, cv=20)
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    
    
    
    
    Accuracy: 0.992 (+/- 0.029)
    
    
    
    In [184]:
    # And the same 20-fold cross validation for the ExtraTrees classifier.
    scores = sklearn.cross_validation.cross_val_score(clf_er, X_er, y_er, cv=20)
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    
    
    
    
    Accuracy: 0.995 (+/- 0.017)
    

    Testing on a large corpus of files

    In the next couple of steps, we train a classifier on all the labeled data and then test it on approximately 366K unlabeled files. I would expect nearly all of them to be benign, with at most a small number of malicious ones.

    
    
    In [185]:
    clf_everything = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
    all_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
                        'acc_public', 'acc_super', 'acc_synthetic', 'ap_count', 'constant_pool_count',
                        'entropy', 'size', 'interface_count', 'major version', 'methods_count', 'minor version',
                        'method_name_digit_run_avg', 'method_name_digit_run_longest',
                        'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
                        'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest',
                        'class_name_digit_run_avg', 'class_name_digit_run_longest',
                        'class_name_length', 'class_name_lowercase_run_avg',
                        'class_name_lowercase_run_longest', 'class_name_slash_count',
                        'class_name_uppercase_run_avg', 'class_name_uppercase_run_longest'] 
    
    X_all = df.as_matrix(all_features)
    y_all = np.array(df['label'].tolist())
    
    clf_everything.fit(X_all, y_all)
    
    
    
    
    Out[185]:
    RandomForestClassifier(bootstrap=True, compute_importances=None,
                criterion='gini', max_depth=None, max_features='auto',
                min_density=None, min_samples_leaf=1, min_samples_split=2,
                n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
                verbose=0)
    
    
    In [190]:
    java_big_pile_df = pd.read_hdf('data/java_clean_df.hd5', 'table')
    
    
    
    
    /opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:520: DeprecationWarning: openFile() is pending deprecation, use open_file() instead. You may use the pt2to3 tool to update your source code.
      self._handle = tables.openFile(self._path, self._mode, **kwargs)
    /opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:1017: DeprecationWarning: getNode() is pending deprecation, use get_node() instead. You may use the pt2to3 tool to update your source code.
      return self._handle.getNode(self.root, key)
    

    Again, one of the features of the classifier is the ability to give a probability that a feature set belongs to a specific class. In the code below, we ask the classifier for the probability that each file belongs to the malicious class. We interpret the results as follows: anything less than 0.5 is benign, 0.5 to 0.8 is a gray area, and 0.8 and above is malicious.

    
    
    In [191]:
    clean = 0
    gray = 0
    bad = 0
    for x in java_big_pile_df.as_matrix(all_features):
        try:
            score = clf_everything.predict_proba(x)[:,1][0]
            if score < 0.5:
                clean += 1
            elif score < 0.8:
                gray += 1
            else:
                bad += 1
        except:
            print "Sad"
            print x
            break
    
    print java_big_pile_df.shape
    print clean
    print gray
    print bad
    
    
    
    
    (366341, 35)
    339771
    10971
    15599
    

    Wow. That looks mostly horrible. We did start with only 500 files of each class, so maybe more training data will help. We expand to about 2,000 malicious files and 2,000 benign files, randomly selecting 2,000 files from the large corpus and labeling them as benign.

    
    
    In [219]:
    java_more_bad_df = pd.read_hdf('data/java_malicious_df.hd5', 'table')
    
    
    
    
    /opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:520: DeprecationWarning: openFile() is pending deprecation, use open_file() instead. You may use the pt2to3 tool to update your source code.
      self._handle = tables.openFile(self._path, self._mode, **kwargs)
    /opt/visiblerisk/lib/python2.7/site-packages/pandas/io/pytables.py:1017: DeprecationWarning: getNode() is pending deprecation, use get_node() instead. You may use the pt2to3 tool to update your source code.
      return self._handle.getNode(self.root, key)
    
    
    
    In [220]:
    java_big_pile_df.head()
    
    
    
    
    Out[220]:
    acc_abstract acc_annotation acc_enum acc_final acc_interface acc_public acc_super acc_synthetic ap_count attributes count class name class_name_digit_run_avg class_name_digit_run_longest class_name_length class_name_lowercase_run_avg class_name_lowercase_run_longest class_name_slash_count class_name_uppercase_run_avg class_name_uppercase_run_longest constant_pool_count
    0 0 0 0 0 0 1 1 0 2 1 com/jidesoft/combobox/DateChooserPanel 0 0 38 5.333333 8 3 1 1 1037 ...
    1 1 0 0 0 1 1 0 0 3 0 org/jmol/modelset/BondIterator 0 0 30 5.000000 8 3 1 1 11 ...
    2 0 0 0 0 0 1 1 0 2 2 org/hibernate/engine/query/ParameterParser 0 0 42 6.000000 9 4 1 1 152 ...
    3 0 0 0 0 0 1 1 0 2 1 com/intellij/updater/Utils 0 0 26 5.500000 8 3 1 1 330 ...
    4 0 0 0 0 0 1 1 0 2 1 com/kiwisoft/db/driver/SybaseDriver 0 0 35 4.833333 8 4 1 1 151 ...

    5 rows × 35 columns

    
    
    In [221]:
    java_big_pile_df['class_name_length'].describe()
    
    
    
    
    Out[221]:
    count    366341.000000
    mean         48.081181
    std          19.812234
    min           1.000000
    25%          36.000000
    50%          47.000000
    75%          61.000000
    max         161.000000
    Name: class_name_length, dtype: float64

    Randomize list

    
    
    In [239]:
    java_random_df = java_big_pile_df.reindex(np.random.permutation(java_big_pile_df.index))
    java_random_2k_df = java_random_df[0:2000]
    java_random_the_rest_df = java_random_df[2000:]
    
    
    
    In [240]:
    java_random_2k_df['label'] = 'benign'
    
    
    
    In [241]:
    java_more_bad_df['label'] = 'malicious'
    
    
    
    In [242]:
    java_4k_df = pd.concat([java_more_bad_df, java_random_2k_df], ignore_index=True)
    java_4k_df.fillna(0, inplace=True)
    
    
    
    In [243]:
    clf_4k = sklearn.ensemble.RandomForestClassifier(n_estimators=75)
    all_features = ['acc_abstract', 'acc_annotation', 'acc_enum', 'acc_final', 'acc_interface',
                    'acc_public', 'acc_super', 'acc_synthetic', 'ap_count',
                    'class_name_digit_run_avg', 'class_name_digit_run_longest',
                    'class_name_length', 'class_name_lowercase_run_avg',
                    'class_name_lowercase_run_longest', 'class_name_slash_count',
                    'class_name_uppercase_run_avg', 'class_name_uppercase_run_longest',
                    'constant_pool_count', 'entropy', 'interface_count', 'major version',
                    'method_name_digit_run_avg', 'method_name_digit_run_longest',
                    'method_name_lowercase_run_avg', 'method_name_lowercase_run_longest',
                    'method_name_uppercase_run_avg', 'method_name_uppercase_run_longest',
                    'methods_count', 'minor version', 'size']
    
    X = java_4k_df.as_matrix(all_features)
    y = np.array(java_4k_df['label'].tolist())
    
    scores = sklearn.cross_validation.cross_val_score(clf_4k, X, y, cv=10)
    print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std() * 2))
    
    
    
    
    Accuracy: 0.989 (+/- 0.008)
    

    Testing out this new classifier still shows us good results.

    
    
    In [244]:
    import sklearn.ensemble
    from sklearn.metrics import confusion_matrix
    from sklearn.cross_validation import train_test_split
    
    # 80/20 Split for predictive test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    clf_4k.fit(X_train, y_train)
    y_pred = clf_4k.predict(X_test)
    labels = ['benign', 'malicious']
    cm = confusion_matrix(y_test, y_pred, labels)
    plot_cm(cm, labels)
    
    
    
    
    Confusion Matrix Stats
    benign/benign: 99.00% (398/402)
    benign/malicious: 1.00% (4/402)
    malicious/benign: 0.79% (3/382)
    malicious/malicious: 99.21% (379/382)
    
    
    
    In [245]:
    # Feature Selection
    # Which features best differentiated the two classes?
    # Here we grab the feature_importances_ from the classifier itself.
    importances = zip(all_features, clf_4k.feature_importances_)
    importances.sort(key=lambda k:k[1], reverse=True)
    sum = 0
    for idx, im in enumerate(importances):
        sum += round(im[1], 5)
        print (str(idx+1) + ':').ljust(4), im[0].ljust(35), round(im[1], 5), sum
    
    
    
    
    1:   class_name_slash_count              0.25082 0.25082
    2:   class_name_length                   0.22822 0.47904
    3:   entropy                             0.0733 0.55234
    4:   constant_pool_count                 0.06575 0.61809
    5:   class_name_uppercase_run_avg        0.06179 0.67988
    6:   size                                0.05436 0.73424
    7:   class_name_lowercase_run_longest    0.05158 0.78582
    8:   class_name_uppercase_run_longest    0.04584 0.83166
    9:   method_name_lowercase_run_longest   0.03077 0.86243
    10:  method_name_lowercase_run_avg       0.02461 0.88704
    11:  class_name_lowercase_run_avg        0.02118 0.90822
    12:  interface_count                     0.01524 0.92346
    13:  major version                       0.01305 0.93651
    14:  method_name_uppercase_run_longest   0.01267 0.94918
    15:  method_name_uppercase_run_avg       0.01198 0.96116
    16:  methods_count                       0.01062 0.97178
    17:  ap_count                            0.00603 0.97781
    18:  class_name_digit_run_avg            0.00561 0.98342
    19:  minor version                       0.00547 0.98889
    20:  acc_public                          0.00262 0.99151
    21:  acc_abstract                        0.00221 0.99372
    22:  class_name_digit_run_longest        0.00133 0.99505
    23:  acc_super                           0.00132 0.99637
    24:  acc_final                           0.00128 0.99765
    25:  method_name_digit_run_avg           0.00089 0.99854
    26:  method_name_digit_run_longest       0.0008 0.99934
    27:  acc_interface                       0.00065 0.99999
    28:  acc_annotation                      0.0 0.99999
    29:  acc_enum                            0.0 0.99999
    30:  acc_synthetic                       0.0 0.99999
    

    Let's test on the large corpus again.

    
    
    In [246]:
    clf_everything_4k = sklearn.ensemble.RandomForestClassifier(n_estimators=50)
    
    X_all = java_4k_df.as_matrix(all_features)
    y_all = np.array(java_4k_df['label'].tolist())
    
    clf_everything_4k.fit(X_all, y_all)
    
    
    
    
    Out[246]:
    RandomForestClassifier(bootstrap=True, compute_importances=None,
                criterion='gini', max_depth=None, max_features='auto',
                min_density=None, min_samples_leaf=1, min_samples_split=2,
                n_estimators=50, n_jobs=1, oob_score=False, random_state=None,
                verbose=0)
    
    
    In [247]:
    clean = 0
    gray = 0
    bad = 0
    X_rest = java_random_the_rest_df.as_matrix(all_features)
    for x in X_rest:
        score = clf_everything_4k.predict_proba(x)[:,1][0]
        if score < 0.5:
            clean += 1
        elif score < 0.8:
            gray += 1
        else:
            bad += 1
    
    print java_random_the_rest_df.shape[0]
    print clean
    print gray
    print bad
    
    
    
    
    364341
    359198
    3766
    1377
    

    That's much better, but there is still a large number of files in the gray area or marked as malicious, more than I would expect. This means we either need more training data or better features.
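
    As one hypothetical example of a "better feature" (not part of the original notebook; the feature name and idea are illustrative assumptions), a simple vowel ratio computed over the letters of the class name might help separate pronounceable package names from random-looking ones:

    # Hypothetical extra feature: fraction of vowels among the letters of the class name.
    # Random-looking names often have an unusual vowel ratio compared to real ones.
    def vowel_ratio(name):
        letters = [c for c in str(name).lower() if c.isalpha()]
        if not letters:
            return 0.0
        return float(sum(1 for c in letters if c in 'aeiou')) / len(letters)

    df['class_name_vowel_ratio'] = df['class name'].map(vowel_ratio)
    df.boxplot('class_name_vowel_ratio', 'label')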

    
    
    In [ ]: